Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into develop

147c3f52 · zchen0211 · 63912dcc · 7506e481 · 147c3f52 · 147c3f52
106 changed file
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -19,7 +19,7 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
 proto_library(framework_proto SRCS framework.proto)

 cc_library(attribute SRCS attribute.cc DEPS framework_proto)
-cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute)
+cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim)
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto proto_desc)

--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -302,7 +302,7 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
    return grad_op_descs;  // empty vector
  }

-  grad_op_descs = OpRegistry::CreateGradOpDescs(*op_desc);
+  grad_op_descs = OpRegistry::CreateGradOpDescs(op_desc.get());

  std::list<std::unique_ptr<OpDescBind>> pending_fill_zeros_ops;
  for (auto& desc : grad_op_descs) {

--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -58,6 +58,8 @@ class MulOpMaker : public OpProtoAndCheckerMaker {
    AddInput("X", "A");
    AddInput("Y", "B");
    AddOutput("Out", "Out");
+    AddAttr<int>("x_num_col_dims", "").SetDefault(1).EqualGreaterThan(1);
+    AddAttr<int>("y_num_col_dims", "").SetDefault(1).EqualGreaterThan(1);
    AddComment("Mul");
  }
 };
@@ -440,6 +442,28 @@ TEST(Backward, simple_single_op) {
            std::vector<std::string>({f::GradVarName("b")}));
 }

+TEST(Backward, default_attribute) {  // attrs never set on the op should read back as defaults
+  f::ProgramDesc *program_desc = GetNewProgramDesc();
+  f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);
+  f::BlockDescBind *block = program.Block(0);
+  f::OpDescBind *op = block->AppendOp();
+  op->SetType("mul");  // only type, inputs and output are set; no attribute is set explicitly
+  op->SetInput("X", {"x"});
+  op->SetInput("Y", {"y"});
+  op->SetOutput("Out", {"out"});
+
+  AppendBackward(program, {});  // NOTE(review): second argument presumably the no-grad variable set - confirm
+
+  ASSERT_EQ(block->AllOps().size(), 2UL);  // the appended op plus one more after AppendBackward
+  EXPECT_EQ(boost::get<int>(op->GetAttr("x_num_col_dims")), 1);  // 1 == SetDefault(1) in MulOpMaker
+  EXPECT_EQ(boost::get<int>(op->GetAttr("y_num_col_dims")), 1);
+
+  f::OpDescBind *grad_op = block->AllOps()[1];  // op at index 1 is expected to be the gradient op
+  ASSERT_EQ(grad_op->Type(), "mul_grad");
+  EXPECT_EQ(boost::get<int>(grad_op->GetAttr("x_num_col_dims")), 1);  // same defaults expected on the grad op
+  EXPECT_EQ(boost::get<int>(grad_op->GetAttr("y_num_col_dims")), 1);
+}
+
 TEST(Backward, simple_mult_op) {
  f::ProgramDesc *program_desc = GetNewProgramDesc();
  f::ProgramDescBind &program = f::ProgramDescBind::Instance(program_desc);

--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -74,6 +74,12 @@ void BlockDescBind::Sync() {
    for (auto &op_desc : ops_) {
      op_field.AddAllocated(op_desc->Proto());
    }
+    auto &var_field = *this->desc_->mutable_vars();
+    var_field.Clear();
+    var_field.Reserve(static_cast<int>(vars_.size()));
+    for (auto &var_desc : vars_) {
+      var_field.AddAllocated(var_desc.second->Proto());
+    }
    need_update_ = false;
  }
 }

--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once

 #include <deque>
+#include <memory>
 #include <unordered_map>
 #include <vector>
 #include "paddle/framework/op_desc.h"

--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -28,7 +28,6 @@ inline DataType ToDataType(std::type_index type) {
    return DataType::INT32;
  } else {
    PADDLE_THROW("Not supported");
-    return static_cast<DataType>(-1);
  }
 }


--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 syntax = "proto2";
+option optimize_for = LITE_RUNTIME;
 package paddle.framework;

 enum AttrType {

--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/framework/op_desc.h"
+#include <functional>
+#include <unordered_map>
 #include "paddle/framework/block_desc.h"
+#include "paddle/framework/operator.h"

 namespace paddle {
 namespace framework {
@@ -25,6 +28,7 @@ OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs,
  inputs_ = inputs;
  outputs_ = outputs;
  attrs_ = attrs;
+  need_update_ = true;
 }

 OpDesc *OpDescBind::Proto() {
@@ -184,5 +188,38 @@ void OpDescBind::Sync() {
    need_update_ = false;
  }
 }
+
+using InferShapeFuncMap =
+    std::unordered_map<std::string /*op_type*/,
+                       std::function<void(InferShapeContext *)>>;  // op type -> callable running that op's InferShape
+
+static InferShapeFuncMap &InferShapeFuncs() {  // returns the shared map, building it on the first call
+  static InferShapeFuncMap *g_map = nullptr;  // allocated once below; never deleted in this function
+  if (g_map == nullptr) {  // NOTE(review): plain null check, no lock is taken around the initialisation
+    g_map = new InferShapeFuncMap();
+    auto &info_map = OpInfoMap::Instance();
+    // One entry per op type listed by OperatorWithKernel::AllOpKernels().
+    for (auto &pair : OperatorWithKernel::AllOpKernels()) {
+      auto &info = info_map.Get(pair.first);
+      // Create the op with an empty type string and empty arguments to avoid runtime checks.
+      auto op =
+          static_cast<OperatorWithKernel *>(info.Creator()("", {}, {}, {}));  // raw pointer, not deleted here
+      g_map->insert(
+          {pair.first, [op](InferShapeContext *ctx) { op->InferShape(ctx); }});  // lambda captures the pointer by value
+    }
+  }
+  return *g_map;
+}
+
+void OpDescBind::InferShape(const BlockDescBind &block) const {  // runs this op's shape inference against `block`
+  auto &funcs = InferShapeFuncs();
+  auto it = funcs.find(this->Type());  // looked up by op type
+  if (it == funcs.end()) {  // the map only holds types listed by AllOpKernels(), so any other type lands here
+    PADDLE_THROW("Operator %s has not been registered", this->Type());
+  }
+  CompileTimeInferShapeContext ctx(*this, block);  // context built from this op desc and the block
+  it->second(&ctx);
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -52,8 +52,6 @@ class OpDescBind {
  void SetOutput(const std::string &param_name,
                 const std::vector<std::string> &args);

-  std::string DebugString() { return this->Proto()->DebugString(); }
-
  bool HasAttr(const std::string &name) const {
    return attrs_.find(name) != attrs_.end();
  }
@@ -97,6 +95,13 @@ class OpDescBind {

  const VariableNameMap &Outputs() const { return outputs_; }

+  AttributeMap *MutableAttrMap() {  // hands out a writable pointer to the attribute map
+    this->need_update_ = true;  // set before returning, since the caller may modify the map through the pointer
+    return &this->attrs_;
+  }
+
+  void InferShape(const BlockDescBind &block) const;
+
 private:
  template <typename MapType>
  static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {

--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
@@ -60,9 +60,14 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDescBind& op_desc) {
 }

 std::vector<std::unique_ptr<OpDescBind>> OpRegistry::CreateGradOpDescs(
-    const OpDescBind& op_desc) {
-  auto& info = OpInfoMap::Instance().Get(op_desc.Type());
-  return info.grad_op_maker_(op_desc);
+    OpDescBind* op_desc) {  // taken by pointer: the attribute map is accessed mutably below
+  auto& info = OpInfoMap::Instance().Get(op_desc->Type());
+
+  if (info.Checker() != nullptr) {  // Checker() may be null, in which case no check runs
+    info.Checker()->Check(*op_desc->MutableAttrMap());  // MutableAttrMap() also sets the desc's need_update_ flag
+  }
+
+  return info.grad_op_maker_(*op_desc);  // grad op descs are built after the attribute check
 }

 }  // namespace framework

--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -80,7 +80,7 @@ class OpRegistry {
  static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);

  static std::vector<std::unique_ptr<OpDescBind>> CreateGradOpDescs(
-      const OpDescBind& op_desc);
+      OpDescBind* op_desc);

  static std::unique_ptr<OperatorBase> CreateOp(const OpDescBind& op_desc);
 };

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -205,13 +205,13 @@ void OperatorBase::GenerateTemporaryNames() {
 }

 template <>
-const Tensor* InferShapeContext::Input<Tensor>(const std::string& name) const {
+const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
  auto* var = InputVar(name);
  return var == nullptr ? nullptr : GetTensorFromVar(var);
 }

 template <>
-const std::vector<const Tensor*> InferShapeContext::MultiInput<Tensor>(
+const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
    const std::string& name) const {
  auto names = op().Inputs(name);
  std::vector<const Tensor*> res;
@@ -225,13 +225,13 @@ const std::vector<const Tensor*> InferShapeContext::MultiInput<Tensor>(
 }

 template <>
-Tensor* InferShapeContext::Output<Tensor>(const std::string& name) const {
+Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
  auto var = OutputVar(name);
  return var == nullptr ? nullptr : var->GetMutable<LoDTensor>();
 }

 template <>
-std::vector<Tensor*> InferShapeContext::MultiOutput<Tensor>(
+std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
    const std::string& name) const {
  auto names = op().Outputs(name);
  std::vector<Tensor*> res;

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -57,7 +57,6 @@ inline std::string GradVarName(const std::string& var_name) {
 }

 class OperatorBase;
-class InferShapeContext;
 class ExecutionContext;

 extern const Tensor* GetTensorFromVar(const Variable* var);
@@ -169,10 +168,11 @@ class NOP : public OperatorBase {
  }
 };

-class InferShapeContext {
+class ExecutionContext {
 public:
-  InferShapeContext(const OperatorBase& op, const Scope& scope)
-      : op_(op), scope_(scope) {}
+  ExecutionContext(const OperatorBase& op, const Scope& scope,
+                   const platform::DeviceContext& device_context)
+      : op_(op), scope_(scope), device_context_(device_context) {}

  const OperatorBase& op() const { return op_; }

@@ -278,31 +278,6 @@ class InferShapeContext {
    out_tensor->set_lod(in_tensor.lod());
  }

- private:
-  const OperatorBase& op_;
-  const Scope& scope_;
-};
-
-template <>
-const Tensor* InferShapeContext::Input<Tensor>(const std::string& name) const;
-
-template <>
-const std::vector<const Tensor*> InferShapeContext::MultiInput<Tensor>(
-    const std::string& name) const;
-
-template <>
-Tensor* InferShapeContext::Output<Tensor>(const std::string& name) const;
-
-template <>
-std::vector<Tensor*> InferShapeContext::MultiOutput<Tensor>(
-    const std::string& name) const;
-
-class ExecutionContext : public InferShapeContext {
- public:
-  ExecutionContext(const OperatorBase& op, const Scope& scope,
-                   const platform::DeviceContext& device_context)
-      : InferShapeContext(op, scope), device_context_(device_context) {}
-
  template <typename PlaceType,
            typename DeviceType = typename platform::EigenDeviceConverter<
                PlaceType>::EigenDeviceType>
@@ -315,10 +290,26 @@ class ExecutionContext : public InferShapeContext {
  }

 private:
+  const OperatorBase& op_;
+  const Scope& scope_;
  const platform::DeviceContext& device_context_;
 };

-class CompileTimeInferShapeContext : public InferShapeContextBase {
+template <>
+const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const;
+
+template <>
+const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
+    const std::string& name) const;
+
+template <>
+Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
+
+template <>
+std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
+    const std::string& name) const;
+
+class CompileTimeInferShapeContext : public InferShapeContext {
 public:
  CompileTimeInferShapeContext(const OpDescBind& op, const BlockDescBind& block)
      : op_(op), block_(block) {}
@@ -414,7 +405,7 @@ class CompileTimeInferShapeContext : public InferShapeContextBase {
  const BlockDescBind& block_;
 };

-class RuntimeInferShapeContext : public InferShapeContextBase {
+class RuntimeInferShapeContext : public InferShapeContext {
 public:
  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
      : op_(op), scope_(scope) {}
@@ -612,7 +603,7 @@ class OperatorWithKernel : public OperatorBase {
                       });
  }

-  virtual void InferShape(InferShapeContextBase* ctx) const = 0;
+  virtual void InferShape(InferShapeContext* ctx) const = 0;

 protected:
  // indicate kernel DataType by input data. Defaultly all input data must be

--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -113,7 +113,7 @@ class OpWithKernelTest : public OperatorWithKernel {
  using OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {}
+  void InferShape(framework::InferShapeContext* ctx) const override {}
  DataType IndicateDataType(const ExecutionContext& ctx) const override {
    return DataType::FP32;
  }

--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <memory>
 #include <vector>
 #include "paddle/framework/framework.pb.h"
 #include "paddle/platform/macros.h"
@@ -31,8 +32,6 @@ class ProgramDescBind {

  BlockDescBind *Block(size_t idx) { return blocks_[idx].get(); }

-  std::string DebugString() { return Proto()->DebugString(); }
-
  size_t Size() const { return blocks_.size(); }

  ProgramDesc *Proto();

--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -20,11 +20,11 @@ namespace paddle {
 namespace framework {

 // TODO(longfei): Once after both CompileTimeInferShapeContext and
-// RuntimeInferShapeContext get merged, we can rename InferShapeContextBase into
+// RuntimeInferShapeContext get merged, we can rename InferShapeContext into
 // InferShapeContext so to replace the current InferShapeContext.
-class InferShapeContextBase {
+class InferShapeContext {
 public:
-  virtual ~InferShapeContextBase() {}
+  virtual ~InferShapeContext() {}
  virtual bool HasInput(const std::string &name) const = 0;
  virtual bool HasOutput(const std::string &name) const = 0;


--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -95,6 +95,19 @@ class Tensor {
  template <typename T>
  inline void CopyFrom(const Tensor& src, const platform::Place& dst_place);

+  /**
+   * @brief   Copy the content of an external vector to a tensor.
+   *
+   * @param[in] src         The external vector.
+   * @param[in] dst_place   The place where the tensor's data is stored.
+   *
+   * @note    CopyFromVector assumes that the tensor has been resized
+   *            before invoking.
+   */
+  template <typename T>
+  inline void CopyFromVector(const std::vector<T>& src,
+                             const platform::Place& dst_place);
+
  /**
   * @brief   Return the slice of the tensor.
   *

--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -123,6 +123,29 @@ inline void Tensor::CopyFrom(const Tensor& src,
 #endif
 }

+template <typename T>  // copies src.size() elements of T from a host vector into this tensor
+inline void Tensor::CopyFromVector(const std::vector<T>& src,
+                                   const platform::Place& dst_place) {
+  auto src_ptr = static_cast<const void*>(src.data());
+  platform::CPUPlace src_place;  // the source is always treated as a CPU place
+  auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
+  auto size = src.size() * sizeof(T);  // number of bytes to copy
+
+  if (platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, src_place,
+                 src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place,
+                 src_ptr, size, 0);
+    // Sync only after the device copy; host copies must not touch CUDA.
+    PADDLE_ENFORCE(cudaStreamSynchronize(0),
+                   "cudaStreamSynchronize failed in Tensor CopyFromVector");
+  }
+#endif
+}
+
 template <typename T>
 inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
  check_memory_size<T>();

--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -263,6 +263,93 @@ TEST(Tensor, CopyFrom) {
 #endif
 }

+TEST(Tensor, CopyFromVector) {  // vector -> tensor copies, first with 9 then with 4 elements
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor cpu_tensor;
+
+    // Copy to CPU Tensor
+    cpu_tensor.Resize(make_ddim({3, 3}));
+    auto cpu_place = new paddle::platform::CPUPlace();
+    cpu_tensor.CopyFromVector<int>(src_vec, *cpu_place);
+
+    // Compare Tensors
+    const int* cpu_ptr = cpu_tensor.data<int>();
+    const int* src_ptr = src_vec.data();
+    ASSERT_NE(src_ptr, cpu_ptr);  // the tensor must hold its own copy
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+    }
+
+    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);  // 4 elements remain
+    cpu_tensor.Resize(make_ddim({2, 2}));
+    cpu_tensor.CopyFromVector<int>(src_vec, *cpu_place);
+    cpu_ptr = cpu_tensor.data<int>();
+    src_ptr = src_vec.data();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    for (size_t i = 0; i < src_vec.size(); ++i) {  // compare only the copied elements
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+    }
+
+    delete cpu_place;
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor cpu_tensor;
+    Tensor gpu_tensor;
+    Tensor dst_tensor;
+
+    // Copy to CPU Tensor
+    cpu_tensor.Resize(make_ddim({3, 3}));
+    auto cpu_place = new paddle::platform::CPUPlace();
+    cpu_tensor.CopyFromVector<int>(src_vec, *cpu_place);
+
+    // Copy to GPUTensor
+    gpu_tensor.Resize(make_ddim({3, 3}));
+    auto gpu_place = new paddle::platform::GPUPlace();
+    gpu_tensor.CopyFromVector<int>(src_vec, *gpu_place);
+    // Copy from GPU to CPU tensor for comparison
+    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
+
+    // Compare Tensors
+    const int* src_ptr = src_vec.data();
+    const int* cpu_ptr = cpu_tensor.data<int>();
+    const int* dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);  // 4 elements remain
+
+    cpu_tensor.Resize(make_ddim({2, 2}));
+    cpu_tensor.CopyFromVector<int>(src_vec, *cpu_place);
+    gpu_tensor.Resize(make_ddim({2, 2}));
+    gpu_tensor.CopyFromVector<int>(src_vec, *gpu_place);
+    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
+
+    src_ptr = src_vec.data();
+    cpu_ptr = cpu_tensor.data<int>();
+    dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < src_vec.size(); ++i) {  // compare only the copied elements
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    delete cpu_place;
+    delete gpu_place;
+  }
+#endif
+}
+
 TEST(Tensor, ReshapeToMatrix) {
  using namespace paddle::framework;
  using namespace paddle::platform;

--- a/paddle/framework/type_defs.h
+++ b/paddle/framework/type_defs.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <functional>
 #include <map>
+#include <memory>
 #include "paddle/platform/variant.h"

 namespace paddle {

--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ b/paddle/math/tests/test_GpuProfiler.cpp
@@ -162,4 +162,4 @@ int main(int argc, char** argv) {
  return RUN_ALL_TESTS();
 }

-#endif /* PADDLE_ONLY_CPU */
+#endif
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -182,7 +182,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
      max_chunk_size_ = platform::GpuMaxChunkSize();
    }
  }
-#endif  // PADDLE_ONLY_CPU
+#endif

  // Allocate a new maximum sized block
  size_t index = 0;

--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -134,7 +134,7 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) {

 bool GPUAllocator::UseGpu() const { return true; }

-#endif  // PADDLE_ONLY_CPU
+#endif

 }  // namespace detail
 }  // namespace memory

--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -51,7 +51,7 @@ class GPUAllocator : public SystemAllocator {
  size_t gpu_alloc_size_ = 0;
  size_t fallback_alloc_size_ = 0;
 };
-#endif  // PADDLE_ONLY_CPU
+#endif

 }  // namespace detail
 }  // namespace memory

--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -62,4 +62,4 @@ TEST(GPUAllocator, Alloc) {
  TestAllocator(a, 2048);
  TestAllocator(a, 0);
 }
-#endif  // PADDLE_ONLY_CPU
+#endif
--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
@@ -89,7 +89,7 @@ void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
  platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
 }

-#endif  // PADDLE_ONLY_CPU
+#endif

 }  // namespace memory
 }  // namespace paddle
--- a/paddle/memory/memcpy.h
+++ b/paddle/memory/memcpy.h
@@ -53,7 +53,7 @@ template <typename DstPlace, typename SrcPlace>
 void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
          cudaStream_t stream);

-#endif  // PADDLE_ONLY_CPU
+#endif

 }  // namespace memory
 }  // namespace paddle
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -111,7 +111,7 @@ size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
  return GetGPUBuddyAllocator(place.device)->Used();
 }

-#endif  // PADDLE_ONLY_CPU
+#endif

 }  // namespace memory
 }  // namespace paddle
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
@@ -135,4 +135,4 @@ TEST(BuddyAllocator, GPUMultAlloc) {
  }
 }

-#endif  // PADDLE_ONLY_CPU
+#endif
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -55,12 +55,20 @@ function(op_library TARGET)
        set(pybind_flag 1)
    endif()

+    # pool_op contains several operators
    if ("${TARGET}" STREQUAL "pool_op")
        set(pybind_flag 1)
        # It's enough to just adding one operator to pybind
        file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
    endif()

+    # pool_with_index_op contains several operators
+    if ("${TARGET}" STREQUAL "pool_with_index_op")
+        set(pybind_flag 1)
+        # It's enough to add just one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
+    endif()
+
    # activation_op contains several operators
    if ("${TARGET}" STREQUAL "activation_op")
        set(pybind_flag 1)

--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -22,7 +22,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Inference"),
                   "Input(Inference) of AccuracyOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Label"),

--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -22,7 +22,7 @@ class ActivationOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
    ctx->ShareLoD("X", /*->*/ "Y");
  }
@@ -33,7 +33,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Y"));
  }
 };
@@ -201,6 +201,40 @@ class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
  }
 };

+template <typename AttrType>  // AttrType: C++ type of the "alpha" attribute
+class ELUOpMaker : public framework::OpProtoAndCheckerMaker {  // declares input X, output Y and attribute alpha
+ public:
+  ELUOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor) The input of ELU operator, it shouldn't be empty. Input "
+             "is flattened and treated as a 1D array.");
+    AddOutput("Y",
+              "(Tensor) The output of ELU operator. It has the same shape as "
+              "the input.");
+    AddAttr<AttrType>(
+        "alpha", "(float, default 1.0) Alpha value in the elu formulation.")
+        .SetDefault(static_cast<AttrType>(1.));  // alpha is 1 unless the op sets it
+    AddComment(R"DOC(
+        ELU activation operator. It applies this element-wise computation on
+        the input: f(x) = max(0, x) + min(0, alpha * (exp(x) - 1)).
+        Check .. _Link: https://arxiv.org/abs/1511.07289 for more details.)DOC");
+  }
+};
+
+template <typename AttrType>  // AttrType: C++ type of the "threshold" attribute
+class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {  // declares input X, output Y and attribute threshold
+ public:
+  Relu6OpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Relu6 operator");
+    AddOutput("Y", "Output of Relu6 operator");
+    AddComment("Relu6 activation operator, relu6 = min(max(0, x), 6)");
+    AddAttr<AttrType>("threshold", "The threshold value of Relu6")
+        .SetDefault(static_cast<AttrType>(6));  // threshold is 6 unless the op sets it
+  }
+};
+
 template <typename AttrType>
 class PowOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
@@ -276,6 +310,12 @@ REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker<float>,
 REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker<float>,
            soft_relu_grad, ops::ActivationOpGrad);

+REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker<float>, elu_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(relu6, ops::ActivationOp, ops::Relu6OpMaker<float>, relu6_grad,
+            ops::ActivationOpGrad);
+
 REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker<float>, pow_grad,
            ops::ActivationOpGrad);


--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -280,6 +280,36 @@ struct BReluGradFunctor : public BaseActivationFunctor<T> {
  }
 };

+// relu6(x) = min(max(0, x), threshold); threshold is the "threshold" attribute.
+template <typename T>
+struct Relu6Functor : public BaseActivationFunctor<T> {
+  float threshold;  // exposed by address through GetAttrs()
+
+  // NOTE: hides (not overrides) BaseActivationFunctor<T>::GetAttrs, for speed.
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) =  // threshold is cast to T, like the 0 bound
+        x.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(threshold));
+  }
+};
+
+template <typename T>  // relu6 gradient: dy where 0 < x < threshold, 0 elsewhere
+struct Relu6GradFunctor : public BaseActivationFunctor<T> {
+  float threshold;  // exposed by address through GetAttrs()
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    dx.device(d) = dy * ((x > static_cast<T>(0)) *  // both bounds compared as T
+                         (x < static_cast<T>(threshold))).template cast<T>();
+  }
+};
+
 // softsign(x) = x / (1 + |x|)
 template <typename T>
 struct SoftsignFunctor : public BaseActivationFunctor<T> {
@@ -354,6 +384,35 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
  }
 };

+template <typename T>  // elu(x) = max(0, x) + min(0, alpha * (exp(x) - 1))
+struct ELUFunctor : public BaseActivationFunctor<T> {
+  float alpha;  // exposed by address through GetAttrs()
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x.cwiseMax(static_cast<T>(0)) +  // alpha is cast to T
+                  (static_cast<T>(alpha) * (x.exp() - static_cast<T>(1)))
+                      .cwiseMin(static_cast<T>(0));
+  }
+};
+
+template <typename T>  // elu gradient: dy for x > 0, dy * (y + alpha) for x < 0
+struct ELUGradFunctor : public BaseActivationFunctor<T> {
+  float alpha;  // exposed by address through GetAttrs()
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    dx.device(d) = dy * (x > static_cast<T>(0)).template cast<T>() +
+                   dy * (y + static_cast<T>(alpha)) *  // alpha is cast to T
+                       (x < static_cast<T>(0)).template cast<T>();
+  }
+};
+
 template <typename T>
 struct PowFunctor : public BaseActivationFunctor<T> {
  float factor;
@@ -410,20 +469,22 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> {
 }  // namespace operators
 }  // namespace paddle

-#define FOR_EACH_KERNEL_FUNCTOR(__macro)                         \
-  __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor);          \
-  __macro(exp, ExpFunctor, ExpGradFunctor);                      \
-  __macro(relu, ReluFunctor, ReluGradFunctor);                   \
-  __macro(tanh, TanhFunctor, TanhGradFunctor);                   \
-  __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                   \
-  __macro(abs, AbsFunctor, AbsGradFunctor);                      \
-  __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
-  __macro(log, LogFunctor, LogGradFunctor);                      \
-  __macro(square, SquareFunctor, SquareGradFunctor);             \
-  __macro(brelu, BReluFunctor, BReluGradFunctor);                \
-  __macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor);      \
-  __macro(pow, PowFunctor, PowGradFunctor);                      \
-  __macro(stanh, STanhFunctor, STanhGradFunctor);                \
-  __macro(softsign, SoftsignFunctor, SoftsignGradFunctor);       \
-  __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor);   \
-  __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor)
+#define FOR_EACH_KERNEL_FUNCTOR(__macro)                          \
+  __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor);           \
+  __macro(exp, ExpFunctor, ExpGradFunctor);                       \
+  __macro(relu, ReluFunctor, ReluGradFunctor);                    \
+  __macro(tanh, TanhFunctor, TanhGradFunctor);                    \
+  __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                    \
+  __macro(abs, AbsFunctor, AbsGradFunctor);                       \
+  __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);  \
+  __macro(log, LogFunctor, LogGradFunctor);                       \
+  __macro(square, SquareFunctor, SquareGradFunctor);              \
+  __macro(brelu, BReluFunctor, BReluGradFunctor);                 \
+  __macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor);       \
+  __macro(pow, PowFunctor, PowGradFunctor);                       \
+  __macro(stanh, STanhFunctor, STanhGradFunctor);                 \
+  __macro(softsign, SoftsignFunctor, SoftsignGradFunctor);        \
+  __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor);    \
+  __macro(relu6, Relu6Functor, Relu6GradFunctor);                 \
+  __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \
+  __macro(elu, ELUFunctor, ELUGradFunctor)
--- a/paddle/operators/adadelta_op.cc
+++ b/paddle/operators/adadelta_op.cc
@@ -22,7 +22,7 @@ class AdadeltaOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Param"),
                   "Input(Param) of AdadeltaOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Grad"),

--- a/paddle/operators/adagrad_op.cc
+++ b/paddle/operators/adagrad_op.cc
@@ -22,7 +22,7 @@ class AdagradOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Param"),
                   "Input(Param) of AdagradOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Grad"),

--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/adamax_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Operator definition for Adamax. InferShape verifies that every input and
+// output is present, that LearningRate and Beta1Pow each hold a single
+// element, and that Grad/Moment/InfNorm match the shape of Param; it then
+// propagates those shapes to the outputs.
+class AdamaxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Moment"),
+                   "Input(Moment) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("InfNorm"),
+                   "Input(InfNorm) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
+                   "Input(Beta1Pow) of AdamaxOp should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
+                   "Output(MomentOut) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("InfNormOut"),
+                   "Output(InfNormOut) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Beta1PowOut"),
+                   "Output(Beta1PowOut) of AdamaxOp should not be null.");
+
+    // The two checks below constrain the element count (product of the
+    // dims), not the rank, so the messages talk about elements.
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "Learning rate should have 1 element");
+    auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
+    PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
+                      "Beta1 power accumulator should have 1 element");
+    auto param_dims = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Grad"),
+        "Param and Grad input of AdamaxOp should have same dimension");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Moment"),
+        "Param and Moment input of AdamaxOp should have same dimension");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("InfNorm"),
+        "Param and InfNorm input of AdamaxOp should have same dimension");
+
+    // Parameter-shaped outputs mirror Param; the power accumulator keeps the
+    // shape of its input.
+    ctx->SetOutputDim("ParamOut", param_dims);
+    ctx->SetOutputDim("MomentOut", param_dims);
+    ctx->SetOutputDim("InfNormOut", param_dims);
+    ctx->SetOutputDim("Beta1PowOut", beta1_pow_dims);
+  }
+};
+
+// Declares the proto of the adamax operator: six inputs, four outputs, the
+// float attributes beta1/beta2/epsilon with their defaults, and the
+// user-facing documentation string.
+class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AdamaxOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // Inputs.
+    AddInput("Param", "(Tensor) Input parameter");
+    AddInput("Grad", "(Tensor) Input gradient");
+    AddInput("LearningRate", "(Tensor) Learning rate");
+    AddInput("Moment", "(Tensor) First moment");
+    AddInput("InfNorm",
+             "(Tensor) "
+             "Input exponentially weighted infinity norm");
+    AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
+
+    // Outputs: one updated counterpart for each stateful input.
+    AddOutput("ParamOut", "(Tensor) Output parameter");
+    AddOutput("MomentOut", "(Tensor) Output first moment");
+    AddOutput("InfNormOut",
+              "(Tensor) "
+              "Output exponentially weighted infinity norm");
+    AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator");
+
+    // Hyper-parameters.
+    AddAttr<float>("beta1",
+                   "(float, default 0.9) "
+                   "Exponential decay rate for the "
+                   "1st moment estimates.")
+        .SetDefault(0.9f);
+    AddAttr<float>("beta2",
+                   "(float, default 0.999) "
+                   "exponential decay rate for the weighted "
+                   "infinity norm estimates.")
+        .SetDefault(0.999f);
+    AddAttr<float>("epsilon",
+                   "(float, default 1.0e-8) "
+                   "Constant for numerical stability")
+        .SetDefault(1.0e-8f);
+    AddComment(R"DOC(
+Adamax Updates Operator.
+
+This implements the Adamax optimizer from Section 7 of the Adam
+paper[1]. Adamax is a variant of the
+Adam algorithm based on the infinity norm.
+
+Adamax updates:
+
+moment_out = beta1 * moment + (1 - beta1) * grad
+inf_norm_out = max(beta2 * inf_norm + epsilon, abs(grad))
+beta1_pow_out = beta1_pow * beta1
+learning_rate_t = learning_rate/(1 - beta1_pow_out)
+param_out = param - learning_rate_t * moment_out/inf_norm_out
+
+The original paper does not have an epsilon attribute.
+However, it is added here for numerical stability
+by preventing divide by 0.
+
+References:
+  [1] Adam: A Method for Stochastic Optimization
+      (https://arxiv.org/abs/1412.6980)
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// adamax is registered without a gradient operator; only a float kernel is
+// registered for the CPU place here.
+REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker);
+REGISTER_OP_CPU_KERNEL(adamax,
+                       ops::AdamaxOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/adamax_op.cu
+++ b/paddle/operators/adamax_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/adamax_op.h"
+
+namespace ops = paddle::operators;
+// Registers the float instantiation of the shared AdamaxOpKernel template
+// for the GPU place.
+REGISTER_OP_GPU_KERNEL(adamax,
+                       ops::AdamaxOpKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/adamax_op.h
+++ b/paddle/operators/adamax_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Kernel performing one Adamax step. All tensors are viewed as flat Eigen
+// vectors and the update is written as Eigen expressions evaluated on the
+// Eigen device obtained from the execution context for Place.
+template <typename Place, typename T>
+class AdamaxOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    using Flat = framework::EigenVector<T>;
+
+    // Output tensors; their buffers are allocated on the execution place
+    // before any Eigen view is created over them.
+    auto* param_dst = ctx.Output<framework::Tensor>("ParamOut");
+    auto* moment_dst = ctx.Output<framework::Tensor>("MomentOut");
+    auto* inf_norm_dst = ctx.Output<framework::Tensor>("InfNormOut");
+    auto* beta1_pow_dst = ctx.Output<framework::Tensor>("Beta1PowOut");
+
+    param_dst->mutable_data<T>(ctx.GetPlace());
+    moment_dst->mutable_data<T>(ctx.GetPlace());
+    inf_norm_dst->mutable_data<T>(ctx.GetPlace());
+    beta1_pow_dst->mutable_data<T>(ctx.GetPlace());
+
+    // Hyper-parameters.
+    float beta1 = ctx.Attr<float>("beta1");
+    float beta2 = ctx.Attr<float>("beta2");
+    float epsilon = ctx.Attr<float>("epsilon");
+
+    // Flat views over the inputs.
+    auto p = Flat::Flatten(*ctx.Input<framework::Tensor>("Param"));
+    auto g = Flat::Flatten(*ctx.Input<framework::Tensor>("Grad"));
+    auto m = Flat::Flatten(*ctx.Input<framework::Tensor>("Moment"));
+    auto u = Flat::Flatten(*ctx.Input<framework::Tensor>("InfNorm"));
+    auto rate = Flat::Flatten(*ctx.Input<framework::Tensor>("LearningRate"));
+    auto b1_pow = Flat::Flatten(*ctx.Input<framework::Tensor>("Beta1Pow"));
+
+    // Flat views over the outputs.
+    auto p_out = Flat::Flatten(*param_dst);
+    auto m_out = Flat::Flatten(*moment_dst);
+    auto u_out = Flat::Flatten(*inf_norm_dst);
+    auto b1_pow_out = Flat::Flatten(*beta1_pow_dst);
+
+    auto dev = ctx.GetEigenDevice<Place>();
+
+    // First moment and infinity-norm estimates.
+    m_out.device(dev) = beta1 * m + (1 - beta1) * g;
+    u_out.device(dev) = g.abs().cwiseMax((beta2 * u) + epsilon);
+
+    // Advance the beta1 power accumulator and derive the step size from it.
+    b1_pow_out.device(dev) = b1_pow * beta1;
+    auto step = rate / (1 - b1_pow_out);
+
+    // The step size is broadcast by the number of elements of MomentOut so
+    // it can be combined element-wise with the moment ratio.
+    Eigen::DSizes<int, 1> bcast(moment_dst->numel());
+    p_out.device(dev) = p - step.broadcast(bcast) * (m_out / u_out);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
@@ -22,7 +22,7 @@ class ClipOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of ClipOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -61,7 +61,7 @@ class ClipOpGrad : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null");

--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
@@ -24,7 +24,7 @@ class ConcatOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
                      "Inputs(X) of ConcatOp should be empty.")
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -83,7 +83,7 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
      : OperatorWithKernel(type, inputs, outputs, attrs) {}

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
  }
 };

--- a/paddle/operators/conv2d_op.cc
+++ b/paddle/operators/conv2d_op.cc
@@ -27,7 +27,7 @@ class Conv2DOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Input"),
                   "Input(Input) of Conv2DOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Filter"),
@@ -106,7 +106,7 @@ class Conv2DOpGrad : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    auto in_dims = ctx->GetInputDim("Input");
    auto filter_dims = ctx->GetInputDim("Filter");
    if (ctx->HasOutput(framework::GradVarName("Input"))) {

--- a/paddle/operators/conv_shift_op.cc
+++ b/paddle/operators/conv_shift_op.cc
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/conv_shift_op.h"
+#include "paddle/framework/eigen.h"
+
+namespace paddle {
+namespace operators {
+
+// Local shorthands: Tensor for framework::Tensor, and EigenMatrix for
+// framework::EigenMatrix with row-major layout and Eigen::DenseIndex as the
+// default template arguments.
+using framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+// Forward operator definition for conv_shift. InferShape requires X and Y to
+// be rank-2 with equal first dimension, Y's second dimension to be odd and
+// no larger than X's, and gives Out the shape and LoD of X.
+class ConvShiftOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // Presence checks.
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
+
+    auto signal_dims = ctx->GetInputDim("X");
+    auto filter_dims = ctx->GetInputDim("Y");
+
+    // Both operands are batches of row vectors.
+    PADDLE_ENFORCE_EQ(signal_dims.size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(filter_dims.size(), 2, "Input(Y)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(signal_dims[0], filter_dims[0],
+                      "The 1st dimension of Input(X) and Input(Y) should "
+                      "be equal.");
+
+    // Constraints on the width of Y relative to X.
+    PADDLE_ENFORCE_EQ(filter_dims[1] % 2, 1,
+                      "The 2nd dimension of Input(Y) should be odd.");
+    PADDLE_ENFORCE_LE(filter_dims[1], signal_dims[1],
+                      "The 2nd dimension of Input(Y) should be less than or "
+                      "equal to the 2nd dimension of Input(X).");
+
+    ctx->SetOutputDim("Out", signal_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+// Gradient operator definition for conv_shift. Each gradient output that is
+// present receives the shape of the forward input it belongs to.
+class ConvShiftGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should be not null.");
+
+    // Handle X first, then Y; a gradient output may be absent, in which case
+    // it is simply skipped.
+    const char *const forward_inputs[] = {"X", "Y"};
+    for (const char *input_name : forward_inputs) {
+      auto grad_name = framework::GradVarName(input_name);
+      if (!ctx->HasOutput(grad_name)) {
+        continue;
+      }
+      auto input_dims = ctx->GetInputDim(input_name);
+      ctx->SetOutputDim(grad_name, input_dims);
+    }
+  }
+};
+
+// Declares the proto of the conv_shift operator: inputs X and Y, output Out,
+// and the user-facing documentation string.
+class ConvShiftOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ConvShiftOpMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor, default Tensor<float>), a 2-D tensor with shape B x M, "
+             "where B is the batch size and M is the data dimension.");
+    AddInput("Y",
+             "(Tensor, default Tensor<float>), a 2-D tensor with shape B x N, "
+             "where B is the batch size and N is the data dimension. N must "
+             "be odd.");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>), a 2-D tensor with shape B x M, "
+              "i.e., the same shape as X.");
+    // NOTE(review): the kernels added alongside this maker pair
+    // X[(i + j - (N-1)/2) mod M] with Y[j] for j in [0, N), i.e. Y is indexed
+    // from 0 rather than modulo N as the text below states - confirm which
+    // convention the documentation should describe.
+    AddComment(R"DOC(
+ConvShift Operator.
+
+A layer for circular convolution of two vectors,
+as used in the Neural Turing Machine: https://arxiv.org/abs/1410.5401
+
+The equation is:
+
+  \f[
+      Out[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} X_{i+j} * Y_{j}
+  \f]
+
+where X's index is computed modulo M, and Y's index is computed modulo N.
+
+Both of the input `X` and `Y` can carry LoD (Level of Details) information.
+However, the output only shares the LoD information with input `X`.
+)DOC");
+  }
+};
+
+// CPU specialization of the conv_shift forward kernel. For every batch row k
+// and output position i it accumulates
+//   Out[k, i] += X[k, (i + j - (N - 1) / 2 + M) % M] * Y[k, j],  j in [0, N)
+// where M and N are the second dimensions of X and Y.
+template <typename T>
+class ConvShiftKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *in_x = context.Input<Tensor>("X");
+    auto *in_y = context.Input<Tensor>("Y");
+    auto *result = context.Output<Tensor>("Out");
+    result->mutable_data<T>(context.GetPlace());
+
+    auto x_mat = EigenMatrix<T>::From(*in_x);
+    auto y_mat = EigenMatrix<T>::From(*in_y);
+    auto out_mat = EigenMatrix<T>::From(*result);
+    out_mat.setZero();
+
+    size_t rows = in_x->dims()[0];
+    size_t m = in_x->dims()[1];
+    size_t n = in_y->dims()[1];
+    size_t half_n = (n - 1) / 2;
+    // Loop-invariant part of the wrapped index; unsigned arithmetic is
+    // modular, so regrouping the terms yields the same value.
+    size_t shift = m - half_n;
+
+    for (size_t row = 0; row < rows; ++row) {
+      for (size_t col = 0; col < m; ++col) {
+        for (size_t tap = 0; tap < n; ++tap) {
+          int src = (col + tap + shift) % m;
+          out_mat(row, col) += x_mat(row, src) * y_mat(row, tap);
+        }
+      }
+    }
+  }
+};
+
+// CPU specialization of the conv_shift gradient kernel. It walks the same
+// (row, position, tap) triples as the forward kernel and scatters dOut into
+// dX and/or dY, whichever of the two outputs is present.
+template <typename T>
+class ConvShiftGradKernel<platform::CPUPlace, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *in_x = context.Input<Tensor>("X");
+    auto *in_y = context.Input<Tensor>("Y");
+    auto *grad_out = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto *grad_x = context.Output<Tensor>(framework::GradVarName("X"));
+    auto *grad_y = context.Output<Tensor>(framework::GradVarName("Y"));
+
+    auto x_mat = EigenMatrix<T>::From(*in_x);
+    auto y_mat = EigenMatrix<T>::From(*in_y);
+    auto dout_mat = EigenMatrix<T>::From(*grad_out);
+
+    size_t rows = in_x->dims()[0];
+    size_t m = in_x->dims()[1];
+    size_t n = in_y->dims()[1];
+    size_t half_n = (n - 1) / 2;
+    // Loop-invariant part of the wrapped index; unsigned arithmetic is
+    // modular, so regrouping the terms yields the same value.
+    size_t shift = m - half_n;
+
+    // The two loop nests are intentionally duplicated so that the presence
+    // checks stay outside of the loops.
+    if (grad_x) {
+      grad_x->mutable_data<T>(context.GetPlace());
+      auto dx_mat = EigenMatrix<T>::From(*grad_x);
+      dx_mat.setZero();
+      for (size_t row = 0; row < rows; ++row) {
+        for (size_t col = 0; col < m; ++col) {
+          for (size_t tap = 0; tap < n; ++tap) {
+            int src = (col + tap + shift) % m;
+            dx_mat(row, src) += dout_mat(row, col) * y_mat(row, tap);
+          }
+        }
+      }
+    }
+
+    if (grad_y) {
+      grad_y->mutable_data<T>(context.GetPlace());
+      auto dy_mat = EigenMatrix<T>::From(*grad_y);
+      dy_mat.setZero();
+      for (size_t row = 0; row < rows; ++row) {
+        for (size_t col = 0; col < m; ++col) {
+          for (size_t tap = 0; tap < n; ++tap) {
+            int src = (col + tap + shift) % m;
+            dy_mat(row, tap) += x_mat(row, src) * dout_mat(row, col);
+          }
+        }
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Registers conv_shift together with its gradient operator conv_shift_grad,
+// then the float CPU kernels for both.
+REGISTER_OP(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker,
+            conv_shift_grad, ops::ConvShiftGradOp);
+REGISTER_OP_CPU_KERNEL(conv_shift,
+                       ops::ConvShiftKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    conv_shift_grad,
+    ops::ConvShiftGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/conv_shift_op.cu
+++ b/paddle/operators/conv_shift_op.cu
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/conv_shift_op.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+namespace {
+
+// Integer division of x by y, computed as (x + y - 1) / y.
+inline int div_up(int x, int y) {
+  const int biased = x + y - 1;
+  return biased / y;
+}
+
+// Forward kernel:
+//   out[k, i] = sum_{j in [0, y_width)}
+//                   x[k, (i + j - y_half_width + x_width) % x_width] * y[k, j]
+//
+// Some notes on the design:
+//
+// Each thread is responsible for computing a single output out[k, i].
+// Thread blocks are based on tiles of x with height 1 in the batch dimension.
+//
+// This design is based on the typical use case where the filter
+// y is fairly small. For large y, it would probably be more efficient
+// to also tile across y.
+//
+// Dynamic shared memory layout, in elements of T (the launch must provide at
+// least blockDim.x + 2 * y_width elements):
+//   [0, num_x)                      cyclically shifted tile of x
+//   [num_x, num_x + y_width)        wrap-around continuation of that tile
+//   [blockDim.x + y_width, +y_width) copy of y[k, :]
+template <typename T>
+__global__ void conv_shift_forward(const T *x, const T *y, T *out, int x_width,
+                                   int y_width, int y_half_width,
+                                   int batch_size) {
+  extern __shared__ T mem[];
+
+  int tx = threadIdx.x;
+  int block_start = blockIdx.x * blockDim.x;  // first x index of this tile
+  int i = block_start + tx;                   // global x index
+  int k = blockIdx.y;                         // batch index
+
+  // Number of x's this block has to process. Only a boundary block can have
+  // fewer than blockDim.x. This is derived from the remaining width instead
+  // of x_width % blockDim.x, which evaluates to 0 (no output written at all)
+  // whenever x_width is an exact multiple of blockDim.x.
+  int block_size = blockDim.x;
+  int remaining = x_width - block_start;
+  if (remaining <= 0) {
+    // Block lies entirely past the end of the row; the condition is uniform
+    // across the block, so no thread is left waiting at the barrier below.
+    return;
+  }
+  int num_x = remaining < block_size ? remaining : block_size;
+
+  T *sx = mem;
+  T *sx_pad = &mem[num_x];
+  T *sy = &mem[blockDim.x + y_width];
+
+  // Collaboratively load y[k, :] and length-y padding of x into shared memory.
+  int pad_start = block_start + num_x + x_width - y_half_width;
+  for (int j = tx; j < y_width; j += blockDim.x) {
+    sy[j] = y[k * y_width + j];
+    sx_pad[j] = x[k * x_width + (pad_start + j) % x_width];
+  }
+
+  // Load a cyclically shifted slice of x into shared memory.
+  bool active = tx < num_x;
+  if (active) {
+    int load_i = (i - y_half_width + x_width) % x_width;
+    sx[tx] = x[k * x_width + load_i];
+  }
+
+  // Every thread of the block has to reach the barrier, so threads beyond
+  // the edge of the tile leave only after it.
+  __syncthreads();
+  if (!active) {
+    return;
+  }
+
+  // Compute dot product of sx[tx:tx + y_width] and sy.
+  T sum = 0;
+  for (int j = 0; j < y_width; ++j) {
+    sum += sx[tx + j] * sy[j];
+  }
+
+  // Save to out[k, i].
+  out[k * x_width + i] = sum;
+}
+
+// Gradient w.r.t. x (naive version based on atomic adds). One thread handles
+// one (batch, y index, x index) triple and accumulates
+// dout[k, i] * y[k, j] into dx[k, (i + j - y_half_width + x_width) % x_width].
+template <typename T>
+__global__ void conv_shift_dx(const T *dout, const T *y, T *dx, int x_width,
+                              int y_width, int y_half_width, int batch_size) {
+  const int x_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int y_idx = blockIdx.y;
+  const int batch = blockIdx.z;
+
+  // Threads past the end of the row have nothing to contribute.
+  if (x_idx >= x_width) {
+    return;
+  }
+
+  const int target = (x_idx + y_idx - y_half_width + x_width) % x_width;
+  const T contribution =
+      dout[batch * x_width + x_idx] * y[batch * y_width + y_idx];
+  atomicAdd(&dx[batch * x_width + target], contribution);
+}
+
+// Gradient w.r.t. y (naive version based on atomic adds). One thread handles
+// one (batch, y index, x index) triple and accumulates
+// x[k, (i + j - y_half_width + x_width) % x_width] * dout[k, i] into dy[k, j].
+template <typename T>
+__global__ void conv_shift_dy(const T *x, const T *dout, T *dy, int x_width,
+                              int y_width, int y_half_width, int batch_size) {
+  const int x_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int y_idx = blockIdx.y;
+  const int batch = blockIdx.z;
+
+  // Threads past the end of the row have nothing to contribute.
+  if (x_idx >= x_width) {
+    return;
+  }
+
+  const int source = (x_idx + y_idx - y_half_width + x_width) % x_width;
+  const T contribution =
+      x[batch * x_width + source] * dout[batch * x_width + x_idx];
+  atomicAdd(&dy[batch * y_width + y_idx], contribution);
+}
+}  // namespace
+
+// GPU specialization of the conv_shift forward kernel. Launches
+// conv_shift_forward on the stream of the CUDA device context with a grid of
+// (ceil(x_width / 256), batch_size) blocks of 256 threads each.
+template <typename T>
+class ConvShiftKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const Tensor *in_x = context.Input<Tensor>("X");
+    const Tensor *in_y = context.Input<Tensor>("Y");
+    Tensor *result = context.Output<Tensor>("Out");
+
+    const T *x_ptr = in_x->data<T>();
+    const T *y_ptr = in_y->data<T>();
+    T *out_ptr = result->mutable_data<T>(context.GetPlace());
+
+    int rows = in_x->dims()[0];
+    int m = in_x->dims()[1];
+    int n = in_y->dims()[1];
+    int half_n = (n - 1) / 2;
+
+    // Launch geometry: one block per tile of x and per batch row.
+    const int threads = 256;
+    int tiles = div_up(m, threads);
+    // Dynamic shared memory: room for threads + 2 * n elements of T.
+    int shared_bytes = (threads + 2 * n) * sizeof(T);
+    dim3 blocks(tiles, rows);
+
+    auto cuda_stream = reinterpret_cast<const platform::CUDADeviceContext &>(
+                           context.device_context())
+                           .stream();
+
+    conv_shift_forward<T><<<blocks, threads, shared_bytes, cuda_stream>>>(
+        x_ptr, y_ptr, out_ptr, m, n, half_n, rows);
+  }
+};
+
+// GPU specialization of the conv_shift gradient kernel. For each gradient
+// output that is present, the buffer is zeroed asynchronously on the CUDA
+// stream and then filled by the matching atomic-add kernel on that stream.
+template <typename T>
+class ConvShiftGradKernel<platform::GPUPlace, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const Tensor *in_x = context.Input<Tensor>("X");
+    const Tensor *in_y = context.Input<Tensor>("Y");
+    const Tensor *grad_out =
+        context.Input<Tensor>(framework::GradVarName("Out"));
+    const T *x_ptr = in_x->data<T>();
+    const T *y_ptr = in_y->data<T>();
+    const T *dout_ptr = grad_out->data<T>();
+
+    Tensor *grad_x = context.Output<Tensor>(framework::GradVarName("X"));
+    Tensor *grad_y = context.Output<Tensor>(framework::GradVarName("Y"));
+
+    int rows = in_x->dims()[0];
+    int m = in_x->dims()[1];
+    int n = in_y->dims()[1];
+    int half_n = (n - 1) / 2;
+
+    auto cuda_stream = reinterpret_cast<const platform::CUDADeviceContext &>(
+                           context.device_context())
+                           .stream();
+
+    // One thread per (batch row, y index, x index) triple.
+    const int threads = 256;
+    int tiles = div_up(m, threads);
+    dim3 blocks(tiles, n, rows);
+
+    if (grad_x) {
+      T *dx_ptr = grad_x->mutable_data<T>(context.GetPlace());
+      // The kernel accumulates with atomicAdd, so start from zeros.
+      cudaMemsetAsync(dx_ptr, 0, grad_x->numel() * sizeof(T), cuda_stream);
+      conv_shift_dx<T><<<blocks, threads, 0, cuda_stream>>>(
+          dout_ptr, y_ptr, dx_ptr, m, n, half_n, rows);
+    }
+    if (grad_y) {
+      T *dy_ptr = grad_y->mutable_data<T>(context.GetPlace());
+      // The kernel accumulates with atomicAdd, so start from zeros.
+      cudaMemsetAsync(dy_ptr, 0, grad_y->numel() * sizeof(T), cuda_stream);
+      conv_shift_dy<T><<<blocks, threads, 0, cuda_stream>>>(
+          x_ptr, dout_ptr, dy_ptr, m, n, half_n, rows);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Registers the float GPU kernels for conv_shift and conv_shift_grad.
+REGISTER_OP_GPU_KERNEL(conv_shift,
+                       ops::ConvShiftKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    conv_shift_grad,
+    ops::ConvShiftGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/conv_shift_op.h
+++ b/paddle/operators/conv_shift_op.h
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Primary template of the conv_shift forward kernel. Only Compute is
+// declared here; the CPUPlace and GPUPlace specializations in
+// conv_shift_op.cc and conv_shift_op.cu provide the definitions.
+template <typename Place, typename T>
+class ConvShiftKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override;
+};
+
+// Primary template of the conv_shift gradient kernel. Only Compute is
+// declared here; the CPUPlace and GPUPlace specializations in
+// conv_shift_op.cc and conv_shift_op.cu provide the definitions.
+template <typename Place, typename T>
+class ConvShiftGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override;
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/cos_sim_op.cc
+++ b/paddle/operators/cos_sim_op.cc
@@ -24,7 +24,7 @@ class CosSimOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    // notnull check
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of CosSimOp should not be null.");
@@ -98,7 +98,7 @@ class CosSimOpGrad : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    // notnull check
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must not be null.");

--- a/paddle/operators/crop_op.cc
+++ b/paddle/operators/crop_op.cc
@@ -25,7 +25,7 @@ class CropOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of CropOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -115,7 +115,7 @@ class CropOpGrad : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null");

--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -22,7 +22,7 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
@@ -60,7 +60,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),

--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -24,7 +24,7 @@ class DropoutOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
    PADDLE_ENFORCE_GE(ctx->Attrs().Get<float>("dropout_prob"), 0);
    PADDLE_ENFORCE_LE(ctx->Attrs().Get<float>("dropout_prob"), 1);
@@ -70,7 +70,7 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_training"), 1,
                      "GradOp is only callable when is_training is true");


--- a/paddle/operators/elementwise_op.h
+++ b/paddle/operators/elementwise_op.h
@@ -25,7 +25,7 @@ class ElementwiseOp : public framework::OperatorWithKernel {

 protected:
  using Tensor = framework::Tensor;
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of elementwise op should not be null");
    PADDLE_ENFORCE(ctx->HasInput("Y"),
@@ -106,7 +106,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
  using Tensor = framework::Tensor;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),

--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/fill_constant_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Operator whose output shape and data type come entirely from attributes.
+class FillConstantOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  // Sets the dims of Output(Out) from the "shape" attribute.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of FillConstantOp should not be null.");
+    const auto &shape_attr = ctx->Attrs().Get<std::vector<int>>("shape");
+    // Widen each int extent to int64_t before building the DDim.
+    std::vector<int64_t> extents;
+    extents.reserve(shape_attr.size());
+    for (int extent : shape_attr) {
+      extents.push_back(static_cast<int64_t>(extent));
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(extents));
+  }
+
+  // Reports the data type by casting the integer "dataType" attribute.
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext &ctx) const override {
+    const int type_id = ctx.Attr<int>("dataType");
+    return static_cast<framework::DataType>(type_id);
+  }
+};
+
+// Declares the attributes ("dataType", "shape", "value") and the single
+// output ("Out") of the fill_constant operator.
+class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FillConstantOpMaker(framework::OpProto *proto,
+                      framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    // Stored as an int; FillConstantOp::IndicateDataType casts it back to
+    // framework::DataType.
+    AddAttr<int>("dataType",
+                 "(int, default 5 (FP32)) "
+                 "Output data type")
+        .SetDefault(framework::DataType::FP32);
+    // No default is set for "shape".
+    AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
+    // The fill value is always declared as float, independent of "dataType".
+    AddAttr<float>("value", "(float, default 0) The value to be filled")
+        .SetDefault(0.0f);
+    AddOutput("Out",
+              "(Tensor) Tensor of specified shape will be filled "
+              "with the specified value");
+    AddComment(R"DOC(Fill up a variable with specified constant value.)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+// Register fill_constant through the no-gradient registration macro, together
+// with its float kernel for CPUPlace.
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(fill_constant, ops::FillConstantOp,
+                             ops::FillConstantOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    fill_constant,
+    ops::FillConstantOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/fill_constant_op.cu
+++ b/paddle/operators/fill_constant_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/fill_constant_op.h"
+
+// Register the float fill_constant kernel for GPUPlace.
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    fill_constant,
+    ops::FillConstantOpKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/fill_constant_op.h
+++ b/paddle/operators/fill_constant_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Kernel that fills every element of Output(Out) with the "value" attribute.
+//
+// Place selects the Eigen device taken from the execution context; T is the
+// element type of the output tensor.
+template <typename Place, typename T>
+class FillConstantOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+    // FillConstantOpMaker declares "value" as a float attribute, so read it
+    // as float regardless of T and convert afterwards. Reading it as Attr<T>
+    // only matches the stored type when T is float.
+    const float value = ctx.Attr<float>("value");
+
+    auto out_eigen = framework::EigenVector<T>::Flatten(*out);
+    auto place = ctx.GetEigenDevice<Place>();
+    out_eigen.device(place) = out_eigen.constant(static_cast<T>(value));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -22,7 +22,7 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of FillZerosLikeOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Y"),

--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -23,7 +23,7 @@ class GatherOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of GatherOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Index"),
@@ -51,7 +51,7 @@ class GatherGradOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
  }


--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -43,7 +43,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of GaussianRandomOp should not be null.");
    auto dims = ctx->Attrs().Get<std::vector<int>>("dims");

--- a/paddle/operators/interp_op.cc
+++ b/paddle/operators/interp_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/net_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Composite operator assembled from three element-wise ops:
+//   Out = (X - Y) * W + Y
+class InterpOp : public NetOp {
+ public:
+  InterpOp(const std::string &type, const framework::VariableNameMap &inputs,
+           const framework::VariableNameMap &outputs,
+           const framework::AttributeMap &attrs)
+      : NetOp(type, inputs, outputs, attrs) {
+    // Every input and output slot must be bound to a non-empty variable name.
+    PADDLE_ENFORCE_NE(Input("X"), framework::kEmptyVarName,
+                      "Input(X) of InterpOp should not be null.");
+    PADDLE_ENFORCE_NE(Input("Y"), framework::kEmptyVarName,
+                      "Input(Y) of InterpOp should not be null.");
+    PADDLE_ENFORCE_NE(Input("W"), framework::kEmptyVarName,
+                      "Input(W) of InterpOp should not be null.");
+    PADDLE_ENFORCE_NE(Output("SubOut"), framework::kEmptyVarName,
+                      "Output(SubOut) of InterpOp should not be null.");
+    PADDLE_ENFORCE_NE(Output("MulOut"), framework::kEmptyVarName,
+                      "Output(MulOut) of InterpOp should not be null.");
+    PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName,
+                      "Output(Out) of InterpOp should not be null.");
+
+    // Collect all variable names once, then wire the three sub-operators.
+    const auto x_name = Input("X");
+    const auto y_name = Input("Y");
+    const auto w_name = Input("W");
+    const auto sub_name = Output("SubOut");
+    const auto mul_name = Output("MulOut");
+    const auto out_name = Output("Out");
+
+    // Step 1: SubOut = X - Y
+    AppendOp(framework::OpRegistry::CreateOp(
+        "elementwise_sub", {{"X", {x_name}}, {"Y", {y_name}}},
+        {{"Out", {sub_name}}}, {}));
+
+    // Step 2: MulOut = SubOut * W = (X - Y) * W
+    AppendOp(framework::OpRegistry::CreateOp(
+        "elementwise_mul", {{"X", {sub_name}}, {"Y", {w_name}}},
+        {{"Out", {mul_name}}}, {{"axis", 0}}));
+
+    // Step 3: Out = MulOut + Y = (X - Y) * W + Y = X * W + Y * (1 - W)
+    AppendOp(framework::OpRegistry::CreateOp(
+        "elementwise_add", {{"X", {mul_name}}, {"Y", {y_name}}},
+        {{"Out", {out_name}}}, {}));
+
+    CompleteAddOp(false);
+  }
+};
+
+// Declares the inputs (X, Y, W), the intermediate outputs (SubOut, MulOut)
+// and the final output (Out) of the interp operator.
+//
+// Each description is built from adjacent string literals, which the compiler
+// concatenates verbatim; every literal that is followed by another one
+// therefore ends with a space so the words do not run together.
+class InterpOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  InterpOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor), 2-D Matrix of shape [batch_size, data_dim] "
+             "containing data samples, the first input of interp_op");
+    AddInput("Y",
+             "(Tensor), 2-D Matrix of shape `[batch_size, data_dim]` "
+             "containing data samples, the second input of interp_op");
+    AddInput("W",
+             "(Tensor), 1-D Vector of shape [batch_size], "
+             "the interpolated values in the half-open interval [0.0, 1.0)");
+    AddOutput("SubOut",
+              "(Tensor), the intermediate subtraction outputs, saving X - Y.")
+        .AsIntermediate();
+    AddOutput("MulOut",
+              "(Tensor), the intermediate multiplication outputs, "
+              "saving the elementwise multiplication of (X - Y) and W.")
+        .AsIntermediate();
+    AddOutput("Out",
+              "(Tensor), the output of interp_op, same shape with X, "
+              "returns the first-dimensional piecewise linear interpolant "
+              "between X and Y");
+    AddComment(R"DOC(
+    Linear Interpolation with two inputs, used in NEURAL TURING MACHINE.
+
+    Equation:
+      Out.row[i] = X.row[i] * W[i] + Y.row[i] * (1 - W[i])
+                 = (X.row[i] - Y.row[i]) * W[i] + Y.row[i]
+
+    Example:
+      X = [[1,2],[3,4]],
+      Y = [[2,1],[4,3]],
+      W = [0.3, 0.4]
+
+      Then, Out = [[1.7,1.3],[3.6,3.4]]
+
+      where 1.7 = 1*0.3+2*(1-0.3),
+            1.3 = 2*0.3+1*(1-0.3),
+            3.6 = 3*0.4+4*(1-0.4),
+            3.4 = 4*0.4+3*(1-0.4)
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Register interp through the no-gradient registration macro.
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(interp, ops::InterpOp, ops::InterpOpMaker);
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -22,7 +22,7 @@ class LookupTableOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("W"),
                   "Input(W) of LookupTableOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Ids"),
@@ -70,7 +70,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    auto table_dims = ctx->GetInputDim("W");
    ctx->SetOutputDim(framework::GradVarName("W"), table_dims);
  }

--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/operators/lstm_unit_op.cc
@@ -22,7 +22,7 @@ class LstmUnitOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("C_prev"),
                   "Input(C_prev) of LSTM should not be null.");
@@ -77,7 +77,7 @@ class LstmUnitGradOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("C")),
                   "Input(C@GRAD) should not be null");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("H")),

--- a/paddle/operators/math/pooling.cc
+++ b/paddle/operators/math/pooling.cc
@@ -18,6 +18,11 @@ namespace paddle {
 namespace operators {
 namespace math {

+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, paddings are two elements. These two elements represent
+ * height and width, respectively.
+ */
 template <typename PoolProcess, typename T>
 class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
 public:
@@ -73,6 +78,11 @@ class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
  }
 };

+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, paddings are two elements. These two elements represent
+ * height and width, respectively.
+ */
 template <typename PoolProcess, class T>
 class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
 public:
@@ -135,6 +145,11 @@ class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
  }
 };

+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, paddings are two elements. These two elements represent
+ * height and width, respectively.
+ */
 template <class T>
 class MaxPool2dGradFunctor<platform::CPUPlace, T> {
 public:
@@ -197,7 +212,7 @@ class MaxPool2dGradFunctor<platform::CPUPlace, T> {
 };

 template class MaxPool2dGradFunctor<platform::CPUPlace, float>;
-// template class MaxPool2dGradFunctor<platform::CPUPlace, double>;
+template class MaxPool2dGradFunctor<platform::CPUPlace, double>;

 template class Pool2dFunctor<platform::CPUPlace,
                             paddle::operators::math::MaxPool<float>, float>;
@@ -216,6 +231,11 @@ template class Pool2dGradFunctor<
 template class Pool2dGradFunctor<
    platform::CPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;

+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, paddings are three elements. These three elements represent
+ * depth, height and width, respectively.
+ */
 template <typename PoolProcess, class T>
 class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
 public:
@@ -286,6 +306,11 @@ class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
  }
 };

+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, paddings are three elements. These three elements represent
+ * depth, height and width, respectively.
+ */
 template <typename PoolProcess, class T>
 class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
 public:
@@ -364,6 +389,11 @@ class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
  }
 };

+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, paddings are three elements. These three elements represent
+ * depth, height and width, respectively.
+ */
 template <class T>
 class MaxPool3dGradFunctor<platform::CPUPlace, T> {
 public:
@@ -440,7 +470,7 @@ class MaxPool3dGradFunctor<platform::CPUPlace, T> {
 };

 template class MaxPool3dGradFunctor<platform::CPUPlace, float>;
-// template class MaxPool3dGradFunctor<platform::CPUPlace, double>;
+template class MaxPool3dGradFunctor<platform::CPUPlace, double>;

 template class Pool3dFunctor<platform::CPUPlace,
                             paddle::operators::math::MaxPool<float>, float>;
@@ -458,6 +488,253 @@ template class Pool3dGradFunctor<
    platform::CPUPlace, paddle::operators::math::MaxPoolGrad<double>, double>;
 template class Pool3dGradFunctor<
    platform::CPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, paddings are two elements. These two elements represent
+ * height and width, respectively.
+ */
+template <typename T>
+class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
+ public:
+  // For every output position, scans the pooling window (clipped to the input
+  // bounds), writes the largest value found to `output`, and writes to `mask`
+  // the offset (h * input_width + w) of that value inside the current
+  // channel's input plane. The mask entry is -1 when no element of the window
+  // compares greater than the initial -FLT_MAX.
+  // ksize, strides and paddings are read as {height, width}.
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    // The channel count is taken from the output tensor.
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+    // Number of elements in one channel plane of the input / output.
+    const int input_stride = input_height * input_width;
+    const int output_stride = output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    T* output_data = output.mutable_data<T>(context.GetPlace());
+    // Indices are stored in the mask with element type T.
+    T* mask_data = mask.mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int ph = 0; ph < output_height; ++ph) {
+          // hend is derived from the unclamped hstart, so it has to be
+          // computed before hstart is clamped to 0.
+          int hstart = ph * stride_height - padding_height;
+          int hend = std::min(hstart + ksize_height, input_height);
+          hstart = std::max(hstart, 0);
+          for (int pw = 0; pw < output_width; ++pw) {
+            // Same ordering constraint as for hstart / hend above.
+            int wstart = pw * stride_width - padding_width;
+            int wend = std::min(wstart + ksize_width, input_width);
+            wstart = std::max(wstart, 0);
+
+            T ele = static_cast<T>(-FLT_MAX);
+            int index = -1;
+            // Strict '<' keeps the first of several equal maxima.
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                if (ele < input_data[h * input_width + w]) {
+                  ele = input_data[h * input_width + w];
+                  index = h * input_width + w;
+                }
+              }
+            }
+            output_data[ph * output_width + pw] = ele;
+            mask_data[ph * output_width + pw] = index;
+          }
+        }
+        // Advance all three pointers to the next channel plane.
+        input_data += input_stride;
+        output_data += output_stride;
+        mask_data += output_stride;
+      }
+    }
+  }
+};
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, paddings are two elements. These two elements represent
+ * height and width, respectively.
+ */
+template <typename T>
+class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {
+ public:
+  // Routes each output gradient back to the input element whose in-plane
+  // offset is recorded in `mask`.
+  //
+  // input_grad:  receives the gradients; values are accumulated with +=.
+  //              NOTE(review): presumably zero-filled by the caller - confirm.
+  // output_grad: gradient with respect to the pooling output.
+  // mask:        per output element, the offset of the selected input element
+  //              within its channel plane, stored as T; negative means that
+  //              no element was selected.
+  // ksize, strides, paddings: not read by this specialization.
+  void operator()(const platform::DeviceContext& context,
+                  framework::Tensor& input_grad,
+                  const framework::Tensor& output_grad,
+                  const framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input_grad.dims()[0];
+    const int input_height = input_grad.dims()[2];
+    const int input_width = input_grad.dims()[3];
+    const int output_channels = output_grad.dims()[1];
+    const int output_height = output_grad.dims()[2];
+    const int output_width = output_grad.dims()[3];
+    // Number of elements in one channel plane of the input / output.
+    const int input_stride = input_height * input_width;
+    const int output_stride = output_height * output_width;
+
+    const T* mask_data = mask.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    for (int n = 0; n < batch_size; ++n) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int ph = 0; ph < output_height; ++ph) {
+          for (int pw = 0; pw < output_width; ++pw) {
+            const int output_idx = ph * output_width + pw;
+            const int input_idx = static_cast<int>(mask_data[output_idx]);
+            // The forward functor stores -1 when it selected no element;
+            // indexing with it would write outside the current plane.
+            if (input_idx < 0) continue;
+            input_grad_data[input_idx] += output_grad_data[output_idx];
+          }
+        }
+        // Advance all three pointers to the next channel plane.
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+        mask_data += output_stride;
+      }
+    }
+  }
+};
+
+// Explicit instantiations of the CPU 2-D max-pool-with-index functors for
+// float and double.
+template class MaxPool2dWithIndexFunctor<platform::CPUPlace, float>;
+template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, float>;
+template class MaxPool2dWithIndexFunctor<platform::CPUPlace, double>;
+template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, double>;
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, paddings are three elements. These three elements represent
+ * depth, height and width, respectively.
+ */
+template <typename T>
+class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
+ public:
+  // For every output position, scans the pooling window (clipped to the input
+  // bounds), writes the largest value found to `output`, and writes to `mask`
+  // the offset ((d * input_height + h) * input_width + w) of that value
+  // inside the current channel's input volume. The mask entry is -1 when no
+  // element of the window compares greater than the initial -FLT_MAX.
+  // ksize, strides and paddings are read as {depth, height, width}.
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input.dims()[0];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    // The channel count is taken from the output tensor.
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+    // Number of elements in one channel volume of the input / output.
+    const int input_stride = input_depth * input_height * input_width;
+    const int output_stride = output_depth * output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    T* output_data = output.mutable_data<T>(context.GetPlace());
+    // Indices are stored in the mask with element type T.
+    T* mask_data = mask.mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int pd = 0; pd < output_depth; ++pd) {
+          // Each *end bound is derived from the unclamped *start, so it has
+          // to be computed before the start is clamped to 0.
+          int dstart = pd * stride_depth - padding_depth;
+          int dend = std::min(dstart + ksize_depth, input_depth);
+          dstart = std::max(dstart, 0);
+          for (int ph = 0; ph < output_height; ++ph) {
+            int hstart = ph * stride_height - padding_height;
+            int hend = std::min(hstart + ksize_height, input_height);
+            hstart = std::max(hstart, 0);
+            for (int pw = 0; pw < output_width; ++pw) {
+              int wstart = pw * stride_width - padding_width;
+              int wend = std::min(wstart + ksize_width, input_width);
+              wstart = std::max(wstart, 0);
+
+              int output_idx = (pd * output_height + ph) * output_width + pw;
+              T ele = static_cast<T>(-FLT_MAX);
+              int index = -1;
+              // Strict '<' keeps the first of several equal maxima.
+              for (int d = dstart; d < dend; ++d) {
+                for (int h = hstart; h < hend; ++h) {
+                  for (int w = wstart; w < wend; ++w) {
+                    int input_idx = (d * input_height + h) * input_width + w;
+                    if (ele < input_data[input_idx]) {
+                      index = input_idx;
+                      ele = input_data[input_idx];
+                    }
+                  }
+                }
+              }
+              output_data[output_idx] = ele;
+              mask_data[output_idx] = index;
+            }
+          }
+        }
+        // Advance all three pointers to the next channel volume.
+        input_data += input_stride;
+        output_data += output_stride;
+        mask_data += output_stride;
+      }
+    }
+  }
+};
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, paddings are three elements. These three elements represent
+ * depth, height and width, respectively.
+ */
+template <typename T>
+class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
+ public:
+  // Routes each output gradient back to the input element whose in-volume
+  // offset is recorded in `mask`.
+  //
+  // input_grad:  receives the gradients; values are accumulated with +=.
+  //              NOTE(review): presumably zero-filled by the caller - confirm.
+  // output_grad: gradient with respect to the pooling output.
+  // mask:        per output element, the offset of the selected input element
+  //              within its channel volume, stored as T; negative means that
+  //              no element was selected.
+  // ksize, strides, paddings: not read by this specialization.
+  void operator()(const platform::DeviceContext& context,
+                  framework::Tensor& input_grad,
+                  const framework::Tensor& output_grad,
+                  const framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input_grad.dims()[0];
+    const int input_depth = input_grad.dims()[2];
+    const int input_height = input_grad.dims()[3];
+    const int input_width = input_grad.dims()[4];
+    const int output_channels = output_grad.dims()[1];
+    const int output_depth = output_grad.dims()[2];
+    const int output_height = output_grad.dims()[3];
+    const int output_width = output_grad.dims()[4];
+    // Number of elements in one channel volume of the input / output.
+    const int input_stride = input_depth * input_height * input_width;
+    const int output_stride = output_depth * output_height * output_width;
+
+    const T* mask_data = mask.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    for (int n = 0; n < batch_size; ++n) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int pd = 0; pd < output_depth; ++pd) {
+          for (int ph = 0; ph < output_height; ++ph) {
+            for (int pw = 0; pw < output_width; ++pw) {
+              const int output_idx =
+                  (pd * output_height + ph) * output_width + pw;
+              const int input_idx = static_cast<int>(mask_data[output_idx]);
+              // The forward functor stores -1 when it selected no element;
+              // indexing with it would write outside the current volume.
+              if (input_idx < 0) continue;
+              input_grad_data[input_idx] += output_grad_data[output_idx];
+            }
+          }
+        }
+        // Advance all three pointers to the next channel volume.
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+        mask_data += output_stride;
+      }
+    }
+  }
+};
+
+// Explicit instantiations of the CPU 3-D max-pool-with-index functors for
+// float and double.
+template class MaxPool3dWithIndexFunctor<platform::CPUPlace, float>;
+template class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, float>;
+template class MaxPool3dWithIndexFunctor<platform::CPUPlace, double>;
+template class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, double>;
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/math/pooling.cu
+++ b/paddle/operators/math/pooling.cu
--- a/paddle/operators/math/pooling.h
+++ b/paddle/operators/math/pooling.h
@@ -21,15 +21,27 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 namespace math {
-//////////////////////
-#define FLT_MAX __FLT_MAX__  //

+#define FLT_MAX \
+  __FLT_MAX__  // TODO: this definition may belong in a shared header rather
+               // than here; the right location is still undecided.
+
+/*
+ * \brief Extracting simple operations from pooling.
+ *        Both MaxPool and AvgPool need "initial", "compute" and "finalize"
+ * operations.
+ *        MaxPool initializes the temp variable to the negative maximum to find
+ * the maximum value in the pooling field.
+ *        AvgPool initializes the temp variable to zero to accumulate all values
+ * in the pooling field, and finally takes the average.
+ *        MaxPoolGrad and AvgPoolGrad are the respective gradient operations.
+ */
 template <class T>
 class MaxPool {
 public:
  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
  DEVICE inline void compute(T& y, const T& x) { y = y > x ? y : x; }
-  DEVICE inline void finalize(T& y, const T& poo_size) {}
+  DEVICE inline void finalize(T& y, const T& pool_field) {}
 };

 template <class T>
@@ -37,8 +49,9 @@ class AvgPool {
 public:
  DEVICE inline T initial() { return static_cast<T>(0); }
  DEVICE inline void compute(T& y, const T& x) { y += x; }
-  DEVICE inline void finalize(T& y, const T& poo_size) { y /= poo_size; }
+  DEVICE inline void finalize(T& y, const T& pool_field) { y /= pool_field; }
 };
+
 template <class T>
 class MaxPoolGrad {
 public:
@@ -57,6 +70,20 @@ class AvgPoolGrad {
  }
 };

+/*
+ * \brief Getting pooling results, and calculating gradient.
+ *
+ * In pool2d, all tensors are in NCHW format, where N is batch size, C is the
+ * number of channels, and H and W are the height and width of feature.
+ * In pool3d, all tensors are in NCDHW format, where N is batch size, C is the
+ * number of channels, and D, H, W are the depth, height and width of feature.
+ *
+ * In max pooling, it is possible that the pooling region has multiple maximum
+ * elements. In this case, we should compute the gradient of the first maximum
+ * element.
+ * This is different from average pooling. So we rewrite the max_pool_grad:
+ * MaxPool2dGradFunctor, MaxPool3dGradFunctor.
+ */
 template <typename Place, typename PoolProcess, typename T>
 class Pool2dFunctor {
 public:
@@ -117,6 +144,51 @@ class MaxPool3dGradFunctor {
                  std::vector<int>& strides, std::vector<int>& paddings);
 };

+/*
+ * \brief Getting max pooling results and corresponding max index, and
+ * calculating gradient.
+ * In up-sampling-pooling, it is necessary to know max element index.
+ * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in
+ * NCDHW format.
+ */
+template <typename Place, typename T>
+class MaxPool2dWithIndexFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings);
+};
+
+template <typename Place, typename T>
+class MaxPool2dWithIndexGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  framework::Tensor& input_grad,
+                  const framework::Tensor& output_grad,
+                  const framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings);
+};
+
+template <typename Place, typename T>
+class MaxPool3dWithIndexFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings);
+};
+
+template <typename Place, typename T>
+class MaxPool3dWithIndexGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  framework::Tensor& input_grad,
+                  const framework::Tensor& output_grad,
+                  const framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings);
+};
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -22,7 +22,7 @@ class MeanOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of MeanOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -47,7 +47,7 @@ class MeanGradOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
  }
 };

--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -26,7 +26,7 @@ class MinusOp : public framework::OperatorWithKernel {
      : OperatorWithKernel(type, inputs, outputs, attrs) {}

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of MinusOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Y"),

--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/operators/modified_huber_loss_op.cc
@@ -22,7 +22,7 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
    PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");

@@ -74,7 +74,7 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
    PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");
    PADDLE_ENFORCE(ctx->HasInput("IntermediateVal"),

--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -24,7 +24,7 @@ class MulOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MulOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of MulOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -97,7 +97,7 @@ class MulOpGrad : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),

--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -24,7 +24,7 @@ class MultiplexOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Ids"), "Input(Ids) shouldn't be null.");
    PADDLE_ENFORCE(!ctx->Inputs("X").empty(),
                   "MultiInput(X) shouldn't be empty.");
@@ -90,7 +90,7 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(!ctx->Inputs("X").empty(), "Input(X) should not be null.");
    PADDLE_ENFORCE(!ctx->Outputs(framework::GradVarName("X")).empty(),
                   "Output(X@Grad) should not be null.");

--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <set>
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/op_registry.h"


--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
@@ -24,7 +24,7 @@ class PadOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of PadOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of PadOp should not be null.");
@@ -98,7 +98,7 @@ class PadOpGrad : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null");

--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
@@ -27,7 +27,7 @@ class PoolOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "X(Input) of Pooling should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -74,7 +74,7 @@ class PoolOpGrad : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "X(Input) of Pooling should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),

--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/pool_with_index_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Returns the pooled extent along a single spatial dimension:
+//   (input_size - filter_size + 2 * padding) / stride + 1
+// evaluated with integer division.
+inline int OutputSizeMaxPool(int input_size, int filter_size, int padding,
+                             int stride) {
+  const int padded_span = input_size - filter_size + 2 * padding;
+  return padded_span / stride + 1;
+}
+
+// Forward operator for max pooling that also emits the index ("Mask") of each
+// selected maximum. Registered for both the 2-D and the 3-D variant; the
+// dimensionality is derived from the rank of X and the length of ksize.
+class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  // Checks that X, Out and Mask are present, validates the pooling attributes
+  // against the rank of X, and sets the shape of both Out and Mask to
+  // {X[0], X[1], pooled extents...}.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "X(Input) of Pooling should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Out(Output) of Pooling should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Mask"),
+                   "Mask(Output) of Pooling should not be null.");
+
+    auto in_x_dims = ctx->GetInputDim("X");
+
+    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+
+    PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
+                   "Pooling input should be 4-D or 5-D");
+
+    // Global pooling: the window covers every spatial dimension of X, so the
+    // user-supplied ksize is replaced by the trailing dimensions of X.
+    if (ctx->Attrs().Get<bool>("globalPooling")) {
+      ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
+      for (size_t i = 0; i < ksize.size(); ++i)
+        ksize[i] = static_cast<int>(in_x_dims[i + 2]);
+    }
+
+    PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U,
+                   "Input size and pooling size should be consistent.");
+    PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
+                      "Strides size and pooling size should be the same.");
+    PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(),
+                      "Paddings size and pooling size should be the same.");
+
+    // The first two dimensions of X are carried over unchanged; each
+    // remaining dimension is reduced by OutputSizeMaxPool.
+    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      output_shape.push_back(OutputSizeMaxPool(in_x_dims[i + 2], ksize[i],
+                                               paddings[i], strides[i]));
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+    ctx->SetOutputDim("Mask", framework::make_ddim(output_shape));
+  }
+};
+
+// Gradient operator for MaxPoolWithIndexOp. The gradient w.r.t. X has the
+// same shape as X.
+class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  // Verifies the variables the gradient kernel relies on and sets the shape
+  // of X@GRAD to the shape of X.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // The gradient kernel dereferences Mask unconditionally, so reject a
+    // missing Mask here instead of failing later inside the kernel.
+    PADDLE_ENFORCE(ctx->HasInput("Mask"), "Input(Mask) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+
+// Proto/attribute maker for the 2-D max-pooling-with-index operator.
+// Declares one input (X), two outputs (Out, Mask) and the attributes ksize,
+// globalPooling (default false), strides (default {1, 1}) and paddings
+// (default {0, 0}). ksize is declared without a default value.
+class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MaxPool2dWithIndexOpMaker(framework::OpProto *proto,
+                            framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "The input tensor of pooling operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of image.");
+    AddOutput("Out",
+              "The output tensor of pooling operator."
+              "The format of output tensor is also NCHW."
+              "Where N is batch size, C is "
+              "the number of channels, H and W is the height and "
+              "width of image.");
+    AddOutput("Mask",
+              "The Mask tensor of pooling operator."
+              "The format of output tensor is also NCHW."
+              "Where N is batch size, C is the number of channels, H and W "
+              "is the height and width of image."
+              "The value in it is the index in current feature map");
+
+    AddAttr<std::vector<int>>(
+        "ksize",
+        "The pooling size(height, width) of pooling operator."
+        "If globalPooling = true, ksize is ignored and need not be "
+        "specified.");  // TODO(Chengduo): Add checker. (Currently,
+                        // TypedAttrChecker don't support vector type.)
+    AddAttr<bool>(
+        "globalPooling",
+        "Whether to use the globalPooling."
+        "Bool constant equal to false or true."
+        "Default false."
+        "If globalPooling = true, ksize is ignored and need not be specified.")
+        .SetDefault(false);
+    AddAttr<std::vector<int>>("strides",
+                              "Strides(height, width) of pooling operator."
+                              "Default {1,1}.")
+        .SetDefault({1, 1});  // TODO(Chengduo): Add checker. (Currently,
+                              // TypedAttrChecker don't support vector type.)
+    AddAttr<std::vector<int>>("paddings",
+                              "Paddings(height, width) of pooling operator."
+                              "Default {0,0}.")
+        .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
+                              // TypedAttrChecker don't support vector type.)
+
+    AddComment(R"DOC(
+The maxPooling2d with index operation calculates the output and the mask
+based on the input and ksize, strides, paddings parameters. Input(X) and
+output(Out, Mask) are in NCHW format. Where N is batch size, C is the
+number of channels, H and W is the height and width of feature.
+Parameters(ksize, strides, paddings) are two elements.
+These two elements represent height and width, respectively.
+)DOC");
+  }
+};
+
+// Proto/attribute maker for the 3-D max-pooling-with-index operator.
+// Declares one input (X), two outputs (Out, Mask) and the attributes ksize,
+// globalPooling (default false), strides (default {1, 1, 1}) and paddings
+// (default {0, 0, 0}). ksize is declared without a default value.
+class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MaxPool3dWithIndexOpMaker(framework::OpProto *proto,
+                            framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "The input tensor of pooling operator. "
+        "The format of input tensor is NCDHW. Where N is batch size, C is "
+        "the number of channels, D, H and W is the depth, height and width of "
+        "image.");
+    AddOutput("Out",
+              "The output tensor of pooling operator."
+              "The format of output tensor is also NCDHW."
+              "Where N is batch size, C is "
+              "the number of channels, D, H and W is the depth, height and "
+              "width of image.");
+    AddOutput("Mask",
+              "The Mask tensor of pooling operator."
+              "The format of output tensor is also NCDHW."
+              "Where N is batch size, C is the number of channels, D, H and W "
+              "is the depth, height and width of image."
+              "The value in it is the index in current feature map");
+
+    AddAttr<std::vector<int>>(
+        "ksize",
+        "The pooling size(depth, height, width) of pooling operator."
+        "If globalPooling = true, ksize is ignored and need not be "
+        "specified.");  // TODO(Chengduo): Add checker. (Currently,
+                        // TypedAttrChecker don't support vector type.)
+    AddAttr<bool>(
+        "globalPooling",
+        "Whether to use the globalPooling."
+        "Bool constant equal to false or true."
+        "Default false."
+        "If globalPooling = true, ksize is ignored and need not be specified.")
+        .SetDefault(false);
+    AddAttr<std::vector<int>>(
+        "strides",
+        "Strides(depth, height, width) of pooling operator."
+        "Default {1,1,1}.")
+        .SetDefault({1, 1, 1});  // TODO(Chengduo): Add checker. (Currently,
+                                 // TypedAttrChecker don't support vector type.)
+    AddAttr<std::vector<int>>(
+        "paddings",
+        "Paddings(depth, height, width) of pooling operator."
+        "Default {0,0,0}.")
+        .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
+                                 // TypedAttrChecker don't support vector type.)
+
+    AddComment(R"DOC(
+The maxpooling3d with index operation calculates the output and the mask
+based on the input and ksize, strides, paddings parameters.
+Input(X) and output(Out, Mask) are in NCDHW format. Where N is batch
+size, C is the number of channels, D, H and W is the depth, height and
+width of feature. Parameters(ksize, strides, paddings) are three elements.
+These three elements represent depth, height and width, respectively.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+// 2-D variant: operator, its gradient operator, and the CPU float kernels.
+// Every registration is terminated with ';' so all invocations read alike.
+REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp,
+            ops::MaxPool2dWithIndexOpMaker, max_pool2d_with_index_grad,
+            ops::MaxPoolWithIndexOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    max_pool2d_with_index,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    max_pool2d_with_index_grad,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, float>);
+
+// 3-D variant: shares the operator and kernel classes with the 2-D variant
+// and differs only in the proto maker.
+REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
+            ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad,
+            ops::MaxPoolWithIndexOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    max_pool3d_with_index,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    max_pool3d_with_index_grad,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/pool_with_index_op.cu
+++ b/paddle/operators/pool_with_index_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/pool_with_index_op.h"
+
+namespace ops = paddle::operators;
+
+// GPU float kernels for the 2-D forward and gradient operators.
+// Every registration is terminated with ';' so all invocations read alike.
+REGISTER_OP_GPU_KERNEL(
+    max_pool2d_with_index,
+    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    max_pool2d_with_index_grad,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, float>);
+
+// GPU float kernels for the 3-D forward and gradient operators.
+REGISTER_OP_GPU_KERNEL(
+    max_pool3d_with_index,
+    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    max_pool3d_with_index_grad,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/pool_with_index_op.h
+++ b/paddle/operators/pool_with_index_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/pooling.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+// Forward kernel for max pooling with index. Reads X and the pooling
+// attributes, then delegates to the 2-D or 3-D functor depending on the
+// number of pooling dimensions. The functor fills both Out and Mask.
+template <typename Place, typename T>
+class MaxPoolWithIndexKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    Tensor* out = context.Output<Tensor>("Out");
+    Tensor* mask = context.Output<Tensor>("Mask");
+
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    if (context.Attr<bool>("globalPooling")) {
+      // Mirror the operator's InferShape: under global pooling the window is
+      // the full spatial extent of X, whatever length the user-supplied ksize
+      // has. Without the resize a mismatched ksize would make the dispatch
+      // below disagree with the inferred output shape.
+      auto in_x_dims = in_x->dims();
+      ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        ksize[i] = static_cast<int>(in_x_dims[i + 2]);
+      }
+    }
+
+    // Dispatch on the number of pooling dimensions. Any other size is left
+    // unhandled here; the operator's InferShape restricts X to 4-D or 5-D.
+    switch (ksize.size()) {
+      case 2: {
+        paddle::operators::math::MaxPool2dWithIndexFunctor<Place, T>
+            pool2d_forward;
+        pool2d_forward(context.device_context(), *in_x, *out, *mask, ksize,
+                       strides, paddings);
+      } break;
+      case 3: {
+        paddle::operators::math::MaxPool3dWithIndexFunctor<Place, T>
+            pool3d_forward;
+        pool3d_forward(context.device_context(), *in_x, *out, *mask, ksize,
+                       strides, paddings);
+      } break;
+    }
+  }
+};
+
+// Gradient kernel for max pooling with index. Zero-fills X@GRAD and then
+// delegates to the 2-D or 3-D gradient functor, passing Out@GRAD and the
+// Mask produced by the forward pass.
+template <typename Place, typename T>
+class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* mask = context.Input<Tensor>("Mask");
+    const Tensor* out_grad =
+        context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+
+    // Nothing to compute when X@GRAD is absent. This check must come before
+    // any use of in_x_grad: the globalPooling branch below reads its dims.
+    if (in_x_grad == nullptr) {
+      return;
+    }
+
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    if (context.Attr<bool>("globalPooling")) {
+      // Mirror the operator's InferShape: the window spans every spatial
+      // dimension, whatever length the user-supplied ksize has.
+      auto in_x_grad_dims = in_x_grad->dims();
+      ksize.resize(static_cast<size_t>(in_x_grad_dims.size()) - 2);
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        ksize[i] = static_cast<int>(in_x_grad_dims[i + 2]);
+      }
+    }
+
+    // Allocate X@GRAD and clear it before the functor writes into it.
+    in_x_grad->mutable_data<T>(context.GetPlace());
+    auto temp = framework::EigenVector<T>::Flatten(*in_x_grad);
+    temp.device(context.GetEigenDevice<Place>()) =
+        temp.constant(static_cast<T>(0));
+
+    // Dispatch on the number of pooling dimensions; other sizes are left
+    // unhandled, in which case X@GRAD stays all zeros.
+    switch (ksize.size()) {
+      case 2: {
+        paddle::operators::math::MaxPool2dWithIndexGradFunctor<Place, T>
+            pool2d_backward;
+        pool2d_backward(context.device_context(), *in_x_grad, *out_grad,
+                        *mask, ksize, strides, paddings);
+      } break;
+      case 3: {
+        paddle::operators::math::MaxPool3dWithIndexGradFunctor<Place, T>
+            pool3d_backward;
+        pool3d_backward(context.device_context(), *in_x_grad, *out_grad,
+                        *mask, ksize, strides, paddings);
+      } break;
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/prelu_op.cc
+++ b/paddle/operators/prelu_op.cc
@@ -26,7 +26,7 @@ class PReluOp : public framework::OperatorWithKernel {
      : OperatorWithKernel(type, inputs, outputs, attrs) {}

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasInput("Alpha"), "Input(Alpha) should not be null");
    PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1,
@@ -63,7 +63,7 @@ class PReluGradOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null");

--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/operators/rank_loss_op.cc
@@ -25,7 +25,7 @@ class RankLossOp : public framework::OperatorWithKernel {
      : OperatorWithKernel(type, inputs, outputs, attrs) {}

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    // input check
    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null");
    PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null");
@@ -90,7 +90,7 @@ class RankLossGradOp : public framework::OperatorWithKernel {
      : OperatorWithKernel(type, inputs, outputs, attrs) {}

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
    PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null.");
    PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null.");

--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
@@ -24,7 +24,7 @@ class ReduceOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of ReduceOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -58,7 +58,7 @@ class ReduceGradOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null.");

--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
@@ -26,7 +26,7 @@ class ReshapeOp : public framework::OperatorWithKernel {
      : OperatorWithKernel(type, inputs, outputs, attrs) {}

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    // input check
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of ReshapeOp should not be null.");
@@ -94,7 +94,7 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
      : OperatorWithKernel(type, inputs, outputs, attrs) {}

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) shouldn't be null.");

--- a/paddle/operators/rmsprop_op.cc
+++ b/paddle/operators/rmsprop_op.cc
@@ -22,7 +22,7 @@ class RmspropOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Param"),
                   "Input(Param) of RmspropOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("MeanSquare"),

--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -26,7 +26,7 @@ class ScaleOp : public framework::OperatorWithKernel {
      : OperatorWithKernel(type, inputs, outputs, attrs) {}

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of ScaleOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),

--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
@@ -23,7 +23,7 @@ class ScatterOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Ref"),
                   "Input(Ref) of ScatterOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Index"),
@@ -60,7 +60,7 @@ class ScatterGradOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    ctx->SetOutputDim(framework::GradVarName("Updates"),
                      ctx->GetInputDim("Updates"));
    ctx->SetOutputDim(framework::GradVarName("Ref"), ctx->GetInputDim("Ref"));

--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -22,7 +22,7 @@ class SequencePoolOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of SequencePoolOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -74,7 +74,7 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Gradient of Out should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("X"), "The input X should not be null.");

--- a/paddle/operators/sequence_softmax_op.cc
+++ b/paddle/operators/sequence_softmax_op.cc
@@ -22,7 +22,7 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of SequenceSoftmaxOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -67,7 +67,7 @@ class SequenceSoftmaxGradOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Out"),
                   "Input(Out) of SequenceSoftmaxGradOp should not be null.");
    PADDLE_ENFORCE(

--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -22,7 +22,7 @@ class SGDOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Param"),
                   "Input(Param) of SGDOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Grad"),

--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -24,7 +24,7 @@ class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput("Labels"),
                   "Input(Labels) should be not null.");
@@ -53,7 +53,7 @@ class SigmoidCrossEntropyWithLogitsGradOp
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput("Labels"),
                   "Input(Labels) should be not null.");

--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
@@ -22,7 +22,7 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
    PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");

@@ -94,7 +94,7 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    auto in_dims = ctx->GetInputDim("X");
    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));


--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -22,7 +22,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of SoftmaxOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Y"),
@@ -69,7 +69,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
                   "Input(Y@GRAD) should be not null.");

--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@@ -83,7 +83,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Logits"),
                   "Input(Logits) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
@@ -128,7 +128,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
                   "Input(Loss@Grad) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Softmax"),

--- a/paddle/operators/split_op.cc
+++ b/paddle/operators/split_op.cc
@@ -24,7 +24,7 @@ class SplitOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of SplitOp should not be null.");
    PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,

--- a/paddle/operators/squared_l2_distance_op.cc
+++ b/paddle/operators/squared_l2_distance_op.cc
@@ -22,7 +22,7 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of SquaredL2DistanceOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Y"),
@@ -86,7 +86,7 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Gradient of Out should not be null");
    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));

--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -22,7 +22,7 @@ class SumOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInputs("X"), "Inputs(X) should not be null");
    auto x_dims = ctx->GetInputsDim("X");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),

--- a/paddle/operators/top_k_op.cc
+++ b/paddle/operators/top_k_op.cc
@@ -22,7 +22,7 @@ class TopkOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of TopkOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),

--- a/paddle/operators/transpose_op.cc
+++ b/paddle/operators/transpose_op.cc
@@ -24,7 +24,7 @@ class TransposeOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
    auto x_dims = ctx->GetInputDim("X");
@@ -93,7 +93,7 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null");

--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -47,7 +47,7 @@ class UniformRandomOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of UniformRandomOp should not be null.");


--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -136,7 +136,7 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }

 cudaStream_t CUDADeviceContext::stream() const { return stream_; }

-#endif  // PADDLE_ONLY_CPU
+#endif

 }  // namespace platform
 }  // namespace paddle
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -41,7 +41,7 @@ limitations under the License. */
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>

-#endif  // PADDLE_ONLY_CPU
+#endif

 namespace paddle {
 namespace platform {

--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -63,4 +63,4 @@ void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
 }  // namespace platform
 }  // namespace paddle

-#endif  // PADDLE_ONLY_CPU
+#endif
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -117,7 +117,6 @@ void BindProgramDesc(py::module &m) {
      .def("append_block", &ProgramDescBind::AppendBlock,
           py::return_value_policy::reference)
      .def("block", &ProgramDescBind::Block, py::return_value_policy::reference)
-      .def("__str__", &ProgramDescBind::DebugString)
      .def("num_blocks", &ProgramDescBind::Size);
 }

@@ -191,15 +190,14 @@ void BindOpDesc(py::module &m) {
      .def("output", &OpDescBind::Output)
      .def("output_names", &OpDescBind::OutputNames)
      .def("set_output", &OpDescBind::SetOutput)
-      .def("__str__", &OpDescBind::DebugString)
-      .def("__repr__", &OpDescBind::DebugString)
      .def("has_attr", &OpDescBind::HasAttr)
      .def("attr_type", &OpDescBind::GetAttrType)
      .def("attr_names", &OpDescBind::AttrNames)
      .def("set_attr", &OpDescBind::SetAttr)
      .def("attr", &OpDescBind::GetAttr)
      .def("set_block_attr", &OpDescBind::SetBlockAttr)
-      .def("get_block_attr", &OpDescBind::GetBlockAttr);
+      .def("get_block_attr", &OpDescBind::GetBlockAttr)
+      .def("infer_shape", &OpDescBind::InferShape);
 }

 }  // namespace pybind

--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -231,21 +231,6 @@ All parameter, weight, gradient are variables in Paddle.
                                   desc.InitializationErrorString());
                    return OpRegistry::CreateOp(desc);
                  })
-      .def_static("infer_shape",
-                  [](OpDescBind &op_desc, BlockDescBind &block) {
-                    auto op = OpRegistry::CreateOp(*op_desc.Proto());
-                    auto *op_with_kernel =
-                        dynamic_cast<OperatorWithKernel *>(op.get());
-                    if (op_with_kernel != nullptr) {
-                      auto ctx = CompileTimeInferShapeContext(op_desc, block);
-                      op_with_kernel->InferShape(&ctx);
-                    } else {
-                      PADDLE_THROW(
-                          "OP(%s) is not type of OperatorWithKernel, "
-                          "should not call this function",
-                          op_desc.Type());
-                    }
-                  })
      .def("backward",
           [](const OperatorBase &forwardOp,
              const std::unordered_set<std::string> &no_grad_vars) {

--- a/python/paddle/v2/framework/graph.py
+++ b/python/paddle/v2/framework/graph.py
+import paddle.v2.framework.core as core
+import collections
+
+__all__ = ['Block', 'Variable', 'Program', 'Operator']
+
+
+class Variable(object):
+    """Python wrapper around a variable description owned by a block.
+
+    On construction the wrapper asks ``block.proto`` for a new variable
+    description, forwards whichever of ``shape``, ``dtype`` and ``lod_level``
+    were supplied, and registers itself in ``block.vars`` under its name.
+    When ``name`` is omitted, a generated name is used instead.
+    """
+
+    def __init__(self, block, name=None, shape=None, dtype=None,
+                 lod_level=None):
+        var_name = Variable._unique_var_name_() if name is None else name
+
+        self.block = block
+        self.proto = block.proto.new_var(var_name)
+
+        # Optional properties are pushed to the description only when given.
+        # TODO(yuyang18): Convert dtype from numpy.dtype
+        # TODO(yuyang18): set_lod_level is not defined.
+        optional_setters = (
+            ('set_shape', shape),
+            ('set_data_type', dtype),
+            ('set_lod_level', lod_level), )
+        for setter_name, value in optional_setters:
+            if value is not None:
+                getattr(self.proto, setter_name)(value)
+
+        block.vars[var_name] = self
+        self.op = None
+
+    # TODO(yuyang18): Get methods
+
+    @staticmethod
+    def _unique_var_name_():
+        """Build a generated variable name from ``core.unique_integer()``."""
+        # The integer is unique during the whole process.
+        return "_generated_var_%d" % core.unique_integer()
+
+
+class Operator(object):
+    """Python wrapper pairing an operator description with its owning block.
+
+    Only ``block`` and ``proto`` are stored at the moment; ``type``,
+    ``inputs``, ``outputs`` and ``attrs`` are accepted but not yet used.
+    """
+
+    def __init__(self,
+                 block,
+                 proto,
+                 type=None,
+                 inputs=None,
+                 outputs=None,
+                 attrs=None):
+        self.block, self.proto = block, proto
+        # TODO: apply `type` when it is not None.
+        # TODO: apply `inputs` when it is not None.
+        # TODO: apply `outputs` when it is not None.
+        # TODO: apply `attrs` when it is not None.
+
+        # TODO: Getters
+
+
+class Block(object):
+    """Python wrapper around block ``idx`` of a program's description.
+
+    Keeps the Python-side ``Variable`` and ``Operator`` wrappers that were
+    created through this block next to the underlying ``proto``.
+    """
+
+    def __init__(self, program, idx):
+        self.program = program
+        self.proto = program.proto.block(idx)
+        self.vars = {}  # var_name --> var
+        self.ops = collections.deque()  # operator list
+
+    @property
+    def parent_idx(self):
+        """Value of ``parent`` on the underlying description."""
+        return self.proto.parent
+
+    @property
+    def idx(self):
+        """Value of ``id`` on the underlying description."""
+        return self.proto.id
+
+    def create_var(self, *args, **kwargs):
+        """Create a ``Variable`` in this block; arguments are forwarded."""
+        return Variable(self, *args, **kwargs)
+
+    def append_op(self, *args, **kwargs):
+        """Append an operator description and wrap it at the back of ``ops``."""
+        new_op = Operator(self, self.proto.append_op(), *args, **kwargs)
+        self.ops.append(new_op)
+        return new_op
+
+    def prepend_op(self, *args, **kwargs):
+        """Prepend an operator description and wrap it at the front of ``ops``."""
+        new_op = Operator(self, self.proto.prepend_op(), *args, **kwargs)
+        self.ops.appendleft(new_op)
+        return new_op
+
+
+class Program(object):
+    """Singleton wrapper around ``core.ProgramDesc`` and its blocks.
+
+    Obtain it through ``Program.instance()``; the constructor asserts that no
+    instance has been recorded on the class yet. ``current_block_idx`` tracks
+    the block that ``create_block`` nests under and ``rollback`` leaves.
+    """
+
+    @classmethod
+    def instance(cls):
+        # From https://stackoverflow.com/questions/8212053
+        # Making Program as a Singleton class.
+        if hasattr(cls, '_instance'):
+            return cls._instance
+        cls._instance = cls()
+        return cls._instance
+
+    def __init__(self):
+        already_built = hasattr(self.__class__, '_instance')
+        assert not already_built, 'Do not call constructor directly!'
+        self.proto = core.ProgramDesc.instance()
+        self.current_block_idx = 0
+        self.blocks = [Block(self, 0)]
+
+    def global_block(self):
+        """Return block 0."""
+        return self.blocks[0]
+
+    def current_block(self):
+        """Return the block at ``current_block_idx``."""
+        return self.blocks[self.current_block_idx]
+
+    def create_block(self):
+        """Append a block under the current one and make it current."""
+        parent_block = self.current_block()
+        child_idx = len(self.blocks)
+        self.proto.append_block(parent_block.proto)
+        self.current_block_idx = child_idx
+        self.blocks.append(Block(self, child_idx))
+        return self.current_block()
+
+    def rollback(self):
+        """Make the current block's parent the current block."""
+        parent_idx = self.current_block().parent_idx
+        self.current_block_idx = parent_idx
+
+
+# g_program is the module-level Program singleton, created when this module is imported.
+g_program = Program.instance()
--- a/python/paddle/v2/framework/tests/test_activation_op.py
+++ b/python/paddle/v2/framework/tests/test_activation_op.py
@@ -137,21 +137,26 @@ class TestBRelu(OpTest):
        self.check_grad(['X'], 'Y', max_relative_error=0.02)


-class TestLeakyRelu(OpTest):
+class TestRelu6(OpTest):
    def setUp(self):
-        self.op_type = "leaky_relu"
-        alpha = 0.02
-        self.attrs = {'alpha': alpha}
-        self.inputs = {'X': np.random.uniform(-3, 3, [4, 4]).astype("float32")}
+        self.op_type = "relu6"
+        x = np.random.uniform(-1, 1, [4, 10]).astype("float32")
+        threshold = 6.0
+        # The same with TestAbs
+        x[np.abs(x) < 0.005] = 0.02
+        x[np.abs(x - threshold) < 0.005] = threshold + 0.02
+
+        self.inputs = {'X': x}
+        self.attrs = {'threshold': threshold}
        self.outputs = {
-            'Y': np.maximum(self.inputs['X'], alpha * self.inputs['X'])
+            'Y': np.minimum(np.maximum(self.inputs['X'], 0), threshold)
        }

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
-        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+        self.check_grad(['X'], 'Y', max_relative_error=0.02)


 class TestSoftRelu(OpTest):
@@ -176,6 +181,26 @@ class TestSoftRelu(OpTest):
        self.check_grad(['X'], 'Y', max_relative_error=0.02)


+class TestELU(OpTest):
+    def setUp(self):
+        self.op_type = "elu"
+        x = np.random.uniform(-3, 3, [4, 4]).astype("float32")
+        alpha = 1.
+        # Note: the standard ELU (alpha = 1) is differentiable at x = 0, unlike the other
+        # ReLU variants, so inputs near 0 need no nudging (x[np.abs(x) < 0.005] = 0.02).
+        self.inputs = {'X': x}
+        self.attrs = {'alpha': alpha}
+        self.outputs = {
+            'Y': np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.02)
+
+
 class TestReciprocal(OpTest):
    def setUp(self):
        self.op_type = "reciprocal"

--- a/python/paddle/v2/framework/tests/test_adamax_op.py
+++ b/python/paddle/v2/framework/tests/test_adamax_op.py
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestAdamaxOp1(OpTest):
+    def setUp(self):
+        '''Check one Adamax step with every attribute passed explicitly
+        '''
+        self.op_type = "adamax"
+        dims = (102, 105)
+        param = np.random.uniform(-1, 1, dims).astype("float32")
+        grad = np.random.uniform(-1, 1, dims).astype("float32")
+        moment = np.random.uniform(-1, 1, dims).astype("float32")
+        # The infinity norm is positive
+        inf_norm = np.random.random(dims).astype("float32")
+
+        learning_rate, epsilon = 0.002, 1e-5
+        beta1, beta2 = 0.78, 0.899
+        beta1_pow = beta1**10
+
+        self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'InfNorm': inf_norm,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32")
+        }
+
+        # Expected values come from the NumPy reference implementation.
+        param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
+            self.inputs, self.attrs)
+        self.outputs = dict(
+            ParamOut=param_out,
+            MomentOut=moment_out,
+            InfNormOut=inf_norm_out,
+            Beta1PowOut=beta1_pow_out)
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAdamaxOp2(OpTest):
+    '''Check one Adamax step when the operator is given no attributes
+    '''
+
+    def setUp(self):
+        self.op_type = "adamax"
+        dims = (102, 105)
+        param = np.random.uniform(-1, 1, dims).astype("float32")
+        grad = np.random.uniform(-1, 1, dims).astype("float32")
+        moment = np.random.uniform(-1, 1, dims).astype("float32")
+        # The infinity norm is positive
+        inf_norm = np.random.random(dims).astype("float32")
+
+        learning_rate, epsilon = 0.002, 1e-8
+        beta1, beta2 = 0.9, 0.999
+        beta1_pow = beta1**8
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'InfNorm': inf_norm,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32")
+        }
+
+        # These values feed only the reference computation; self.attrs is
+        # intentionally left unset on the test case.
+        reference_attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
+        param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
+            self.inputs, reference_attrs)
+        self.outputs = dict(
+            ParamOut=param_out,
+            MomentOut=moment_out,
+            InfNormOut=inf_norm_out,
+            Beta1PowOut=beta1_pow_out)
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAdamaxOpMultipleSteps(OpTest):
+    '''Test Adamax Operator over several consecutive update steps
+
+    Each step's outputs are checked against adamax_step and then fed back in
+    as the next step's inputs, together with a freshly drawn gradient.
+    '''
+
+    def setUp(self):
+        self.op_type = "adamax"
+        self.num_steps = 10
+
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The infinity norm is positive
+        inf_norm = np.random.random((102, 105)).astype("float32")
+
+        learning_rate = 0.002
+        beta1 = 0.8
+        beta2 = 0.99
+        epsilon = 1e-5
+        beta1_pow = 1
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'InfNorm': inf_norm,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32")
+        }
+
+        self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
+        # Expected outputs are not computed here: test_check_output derives
+        # them from the current inputs at the start of every step.
+
+    def test_check_output(self):
+        for _ in range(self.num_steps):
+            param_out, moment_out, inf_norm_out, beta1_pow_out = adamax_step(
+                self.inputs, self.attrs)
+
+            self.outputs = {
+                'ParamOut': param_out,
+                'MomentOut': moment_out,
+                'InfNormOut': inf_norm_out,
+                'Beta1PowOut': beta1_pow_out
+            }
+
+            # Verify output for this step
+            self.check_output()
+
+            # Output of this step becomes input for next step
+            self.inputs['Param'] = param_out
+            self.inputs['Moment'] = moment_out
+            self.inputs['InfNorm'] = inf_norm_out
+            self.inputs['Beta1Pow'] = beta1_pow_out
+
+            # Randomize gradient for next step
+            self.inputs['Grad'] = np.random.uniform(
+                -1, 1, (102, 105)).astype("float32")
+
+
+def adamax_step(inputs, attributes):
+    '''
+    NumPy reference computation of a single Adamax update
+    :param inputs: dict with keys 'Param', 'Grad', 'Moment', 'InfNorm',
+    'LearningRate' and 'Beta1Pow'
+    :param attributes: dict with keys 'beta1', 'beta2' and 'epsilon'
+    :return tuple: (param_out, moment_out, inf_norm_out, beta1_pow_out)
+    '''
+    param, grad, moment, inf_norm, lr, beta1_pow = [
+        inputs[key]
+        for key in ('Param', 'Grad', 'Moment', 'InfNorm', 'LearningRate',
+                    'Beta1Pow')
+    ]
+    beta1, beta2, epsilon = [
+        attributes[key] for key in ('beta1', 'beta2', 'epsilon')
+    ]
+
+    # The power accumulator advances first; the step size uses the new value.
+    beta1_pow_out = beta1_pow * beta1
+    lr_t = (lr / (1 - beta1_pow_out))
+
+    moment_out = beta1 * moment + (1 - beta1) * grad
+    inf_norm_out = np.maximum(beta2 * inf_norm + epsilon, np.abs(grad))
+    param_out = param - lr_t * np.divide(moment_out, inf_norm_out)
+
+    return param_out, moment_out, inf_norm_out, beta1_pow_out
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_conv_shift_op.py
+++ b/python/paddle/v2/framework/tests/test_conv_shift_op.py
--- a/python/paddle/v2/framework/tests/test_fill_constant_op.py
+++ b/python/paddle/v2/framework/tests/test_fill_constant_op.py
--- a/python/paddle/v2/framework/tests/test_infer_shape.py
+++ b/python/paddle/v2/framework/tests/test_infer_shape.py
--- a/python/paddle/v2/framework/tests/test_interp_op.py
+++ b/python/paddle/v2/framework/tests/test_interp_op.py
--- a/python/paddle/v2/framework/tests/test_pool_max_op.py
+++ b/python/paddle/v2/framework/tests/test_pool_max_op.py
--- a/python/paddle/v2/framework/tests/test_program.py
+++ b/python/paddle/v2/framework/tests/test_program.py