From 1bb0e2943b13b1d65da65c99897105f665ae09d7 Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Wed, 11 Oct 2017 19:02:40 +0800
Subject: [PATCH] Add pool2d cudnn

---
 paddle/framework/operator.h                   |   9 +
 paddle/operators/CMakeLists.txt               |   7 +
 paddle/operators/pool_cudnn_op.cc             |  34 ++++
 paddle/operators/pool_cudnn_op.cu             | 174 ++++++++++++++++++
 paddle/operators/pool_cudnn_op.h              |  22 +++
 .../framework/tests/test_pool2d_cudnn_op.py   | 144 +++++++++++++++
 6 files changed, 390 insertions(+)
 create mode 100644 paddle/operators/pool_cudnn_op.cc
 create mode 100644 paddle/operators/pool_cudnn_op.cu
 create mode 100644 paddle/operators/pool_cudnn_op.h
 create mode 100644 python/paddle/v2/framework/tests/test_pool2d_cudnn_op.py
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 15f80b57206..5db637abbc9 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -289,6 +289,15 @@ class ExecutionContext {
     return device_context_;
   }
 
+#ifdef PADDLE_WITH_CUDA
+  const platform::CUDADeviceContext& cuda_device_context() const {
+    PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
+    auto cuda_ctx =
+        reinterpret_cast<const platform::CUDADeviceContext*>(&device_context_);
+    return *cuda_ctx;
+  }
+#endif  // PADDLE_WITH_CUDA
+
  private:
   const OperatorBase& op_;
   const Scope& scope_;
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index ad941bde2be..e2a8615f908 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -69,6 +69,13 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
     endif()
 
+    # pool_cudnn_op contains several operators
+    if ("${TARGET}" STREQUAL "pool_cudnn_op")
+        set(pybind_flag 1)
+        # It's enough to just adding one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
+    endif()
+
     # activation_op contains several operators
     if ("${TARGET}" STREQUAL "activation_op")
         set(pybind_flag 1)
diff --git a/paddle/operators/pool_cudnn_op.cc b/paddle/operators/pool_cudnn_op.cc
new file mode 100644
index 00000000000..8307561194e
--- /dev/null
+++ b/paddle/operators/pool_cudnn_op.cc
@@ -0,0 +1,34 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/pool_cudnn_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP(pool2d_cudnn, ops::PoolOp, ops::Pool2dOpMaker, pool2d_cudnn_grad,
+            ops::PoolOpGrad);
+
+REGISTER_OP_CPU_KERNEL(pool2d_cudnn,
+                       ops::PoolKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(pool2d_cudnn_grad,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>)
+
+// REGISTER_OP(pool3d_cudnn, ops::PoolOp, ops::Pool3dOpMaker, pool3d_cudnn_grad,
+//            ops::PoolOpGrad);
+//
+// REGISTER_OP_CPU_KERNEL(pool3d_cudnn,
+//                       ops::PoolKernel<paddle::platform::CPUPlace, float>);
+// REGISTER_OP_CPU_KERNEL(pool3d_cudnn_grad,
+//                       ops::PoolGradKernel<paddle::platform::CPUPlace,
+//                       float>);
diff --git a/paddle/operators/pool_cudnn_op.cu b/paddle/operators/pool_cudnn_op.cu
new file mode 100644
index 00000000000..c5c9bf73b97
--- /dev/null
+++ b/paddle/operators/pool_cudnn_op.cu
@@ -0,0 +1,174 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/pool_cudnn_op.h"
+#include "paddle/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
+using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor;
+using DataLayout = platform::DataLayout;
+using PoolingMode = platform::PoolingMode;
+
+// NOTE: copy from conv_cudnn
+std::vector<int> Dims2Vector(const framework::DDim &dims) {
+  std::vector<int> ret;
+  for (int i = 0; i < dims.size(); i++) {
+    ret.push_back(dims[i]);
+  }
+  return ret;
+}
+
+template <typename T>
+class PoolCudnnOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use GPUPlace.");
+
+    const Tensor *input = ctx.Input<Tensor>("X");
+    Tensor *output = ctx.Output<Tensor>("Out");
+
+    const T *input_data = input->data<T>();
+    T *output_data = output->mutable_data<T>(ctx.GetPlace());
+
+    std::string pooling_type = ctx.Attr<std::string>("poolingType");
+    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    if (ctx.Attr<bool>("globalPooling")) {
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        ksize[i] = static_cast<int>(input->dims()[i + 2]);
+      }
+    }
+
+    // ------------------- cudnn descriptors ---------------------
+    ScopedTensorDescriptor input_desc;
+    ScopedTensorDescriptor output_desc;
+    ScopedPoolingDescriptor pool_desc;
+    DataLayout layout = DataLayout::kNCHW;
+
+    cudnnTensorDescriptor_t cudnn_input_desc =
+        input_desc.descriptor<T>(layout, Dims2Vector(input->dims()));
+    cudnnTensorDescriptor_t cudnn_output_desc =
+        output_desc.descriptor<T>(layout, Dims2Vector(output->dims()));
+
+    PoolingMode pooling_mode;
+    if (pooling_type == "max") {
+      pooling_mode = PoolingMode::kMaximum;
+    } else {
+      pooling_mode = PoolingMode::kAverage;
+    }
+
+    cudnnPoolingDescriptor_t cudnn_pool_desc =
+        pool_desc.descriptor(pooling_mode, ksize, paddings, strides);
+
+    // ------------------- cudnn pool algorithm ---------------------
+    auto handle = ctx.cuda_device_context().cudnn_handle();
+    T alpha = 1.0f, beta = 0.0f;
+
+    PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward(
+        handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
+        cudnn_output_desc, output_data));
+  }
+};
+
+template <typename T>
+class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use GPUPlace.");
+
+    const Tensor *input = ctx.Input<Tensor>("X");
+    const Tensor *output = ctx.Input<Tensor>("Out");
+    const Tensor *output_grad =
+        ctx.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor *input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    std::string pooling_type = ctx.Attr<std::string>("poolingType");
+    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+
+    if (ctx.Attr<bool>("globalPooling")) {
+      for (size_t i = 0; i < ksize.size(); ++i)
+        ksize[i] = static_cast<int>(input->dims()[i + 2]);
+    }
+
+    const T *input_data = input->data<T>();
+    const T *output_data = output->data<T>();
+    const T *output_grad_data = output_grad->data<T>();
+
+    // ------------------- cudnn descriptors ---------------------
+    ScopedTensorDescriptor input_desc;
+    ScopedTensorDescriptor output_desc;
+    ScopedTensorDescriptor input_grad_desc;
+    ScopedTensorDescriptor output_grad_desc;
+    ScopedPoolingDescriptor pool_desc;
+    DataLayout layout = DataLayout::kNCHW;
+
+    cudnnTensorDescriptor_t cudnn_input_desc =
+        input_desc.descriptor<T>(layout, Dims2Vector(input->dims()));
+    cudnnTensorDescriptor_t cudnn_output_desc =
+        output_desc.descriptor<T>(layout, Dims2Vector(output->dims()));
+    cudnnTensorDescriptor_t cudnn_output_grad_desc =
+        output_grad_desc.descriptor<T>(layout,
+                                       Dims2Vector(output_grad->dims()));
+
+    PoolingMode pooling_mode;
+    if (pooling_type == "max") {
+      pooling_mode = PoolingMode::kMaximum;
+    } else {
+      pooling_mode = PoolingMode::kAverage;
+    }
+
+    cudnnPoolingDescriptor_t cudnn_pool_desc =
+        pool_desc.descriptor(pooling_mode, ksize, paddings, strides);
+
+    // ------------------- cudnn pool algorithm ---------------------
+    auto handle = ctx.cuda_device_context().cudnn_handle();
+    T alpha = 1.0f, beta = 0.0f;
+
+    if (input_grad) {
+      T *input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
+      auto temp = framework::EigenVector<T>::Flatten(*input_grad);
+      temp.device(ctx.GetEigenDevice<paddle::platform::GPUPlace>()) =
+          temp.constant(static_cast<T>(0));
+
+      cudnnTensorDescriptor_t cudnn_input_grad_desc =
+          input_grad_desc.descriptor<T>(layout,
+                                        Dims2Vector(input_grad->dims()));
+
+      PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward(
+          handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,
+          cudnn_output_grad_desc, output_grad_data, cudnn_input_desc,
+          input_data, &beta, cudnn_input_grad_desc, input_grad_data));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(pool2d_cudnn, ops::PoolCudnnOpKernel<float>);
+REGISTER_OP_GPU_KERNEL(pool2d_cudnn_grad, ops::PoolCudnnGradOpKernel<float>);
+//
+// REGISTER_OP_GPU_KERNEL(pool3d_cudnn, ops::PoolCudnnOpKernel<float>);
+// REGISTER_OP_GPU_KERNEL(pool3d_cudnn_grad, ops::PoolCudnnGradOpKernel<float>);
diff --git a/paddle/operators/pool_cudnn_op.h b/paddle/operators/pool_cudnn_op.h
new file mode 100644
index 00000000000..8940967ab7a
--- /dev/null
+++ b/paddle/operators/pool_cudnn_op.h
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/pool_op.h"
+
+namespace paddle {
+namespace operators {}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/v2/framework/tests/test_pool2d_cudnn_op.py b/python/paddle/v2/framework/tests/test_pool2d_cudnn_op.py
new file mode 100644
index 00000000000..8180468014b
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_pool2d_cudnn_op.py
@@ -0,0 +1,144 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def max_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
+
+    N, C, H, W = x.shape
+    if global_pool == 1:
+        ksize = [H, W]
+    H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    out = np.zeros((N, C, H_out, W_out))
+    for i in xrange(H_out):
+        for j in xrange(W_out):
+            r_start = np.max((i * strides[0] - paddings[0], 0))
+            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            c_start = np.max((j * strides[1] - paddings[1], 0))
+            c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+            x_masked = x[:, :, r_start:r_end, c_start:c_end]
+
+            out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
+    return out
+
+
+def avg_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
+
+    N, C, H, W = x.shape
+    if global_pool == 1:
+        ksize = [H, W]
+    H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    out = np.zeros((N, C, H_out, W_out))
+    for i in xrange(H_out):
+        for j in xrange(W_out):
+            r_start = np.max((i * strides[0] - paddings[0], 0))
+            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            c_start = np.max((j * strides[1] - paddings[1], 0))
+            c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+            x_masked = x[:, :, r_start:r_end, c_start:c_end]
+
+            out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / (
+                (r_end - r_start) * (c_end - c_start))
+    return out
+
+
+class TestPool2d_cudnn_Op(OpTest):
+    def setUp(self):
+        self.initTestCase()
+        input = np.random.random(self.shape).astype("float32")
+        output = self.pool2D_forward_naive(input, self.ksize, self.strides,
+                                           self.paddings, self.global_pool)
+        self.inputs = {'X': input}
+
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'poolingType': self.pool_type,
+            'globalPooling': self.global_pool,
+        }
+
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        if self.pool_type != "max":
+            self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
+
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "pool2d_cudnn"
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.shape = [2, 3, 5, 5]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+
+class TestCase1(TestPool2d_cudnn_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d_cudnn"
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+
+class TestCase2(TestPool2d_cudnn_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d_cudnn"
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 1]
+
+
+class TestCase3(TestPool2d_cudnn_Op):
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "pool2d_cudnn"
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 5, 5]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+
+class TestCase4(TestPool2d_cudnn_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d_cudnn"
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+
+class TestCase5(TestPool2d_cudnn_Op):
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d_cudnn"
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 1]
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab