Pool2d cuda kernel supports fp16 (#28316)

* pool2d cuda kernel supports fp16
* fix compile issue of template
* add unit tests

Pool2d cuda kernel supports fp16 (#28316)
* pool2d cuda kernel supports fp16
* fix compile issue of template
* add unit tests
6115c14f · Leo Chen · GitHub · f41104ef · 6115c14f · 6115c14f
7 changed files
--- a/paddle/fluid/operators/math/pooling.cu
+++ b/paddle/fluid/operators/math/pooling.cu
@@ -14,6 +14,7 @@ limitations under the License. */

 #include <algorithm>
 #include <vector>
+
 #include "paddle/fluid/operators/math/pooling.h"
 #include "paddle/fluid/platform/cuda_primitives.h"

@@ -126,7 +127,7 @@ __global__ void KernelPool2DGrad(
      phend = min(h_offset / stride_height + 1, output_height);
      pwend = min(w_offset / stride_width + 1, output_width);
    }
-    T gradient = 0;
+    T gradient = static_cast<T>(0.0);
    T input = input_data[index];

    int output_stride;
@@ -264,12 +265,12 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
 }

 /*
-* Tensors are in NCHW or NHWC format.
-* Ksize, strides are two elements. These two elements represent height
-* and width, respectively.
-* Paddings are four elements. These four elements represent height_up,
-* height_down, width_left and width_right, respectively.
-*/
+ * Tensors are in NCHW or NHWC format.
+ * Ksize, strides are two elements. These two elements represent height
+ * and width, respectively.
+ * Paddings are four elements. These four elements represent height_up,
+ * height_down, width_left and width_right, respectively.
+ */
 template <typename PoolProcess, typename T>
 class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
 public:
@@ -351,12 +352,12 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
  }
 };
 /*
-* Tensors are in NCHW or NHWC format.
-* Ksize, strides are two elements. These two elements represent height
-* and width, respectively.
-* Paddings are four elements. These four elements represent height_up,
-* height_down, width_left and width_right, respectively.
-*/
+ * Tensors are in NCHW or NHWC format.
+ * Ksize, strides are two elements. These two elements represent height
+ * and width, respectively.
+ * Paddings are four elements. These four elements represent height_up,
+ * height_down, width_left and width_right, respectively.
+ */
 template <typename PoolProcess, typename T>
 class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
 public:
@@ -448,12 +449,12 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
 };

 /*
-* Tensors are in NCHW or NHWC format.
-* Ksize, strides are two elements. These two elements represent height
-* and width, respectively.
-* Paddings are four elements. These four elements represent height_up,
-* height_down, width_left and width_right, respectively.
-*/
+ * Tensors are in NCHW or NHWC format.
+ * Ksize, strides are two elements. These two elements represent height
+ * and width, respectively.
+ * Paddings are four elements. These four elements represent height_up,
+ * height_down, width_left and width_right, respectively.
+ */
 template <typename T>
 class MaxPool2dGradFunctor<platform::CUDADeviceContext, T> {
 public:
@@ -549,6 +550,8 @@ template class Pool2dDirectCUDAFunctor<paddle::operators::math::AvgPool<float>,

 template class MaxPool2dGradFunctor<platform::CUDADeviceContext, float>;
 template class MaxPool2dGradFunctor<platform::CUDADeviceContext, double>;
+template class MaxPool2dGradFunctor<platform::CUDADeviceContext,
+                                    paddle::platform::float16>;

 template class Pool2dFunctor<platform::CUDADeviceContext,
                             paddle::operators::math::MaxPool<float>, float>;
@@ -571,6 +574,23 @@ template class Pool2dGradFunctor<platform::CUDADeviceContext,
                                 paddle::operators::math::AvgPoolGrad<double>,
                                 double>;

+template class Pool2dFunctor<
+    platform::CUDADeviceContext,
+    paddle::operators::math::MaxPool<paddle::platform::float16>,
+    paddle::platform::float16>;
+template class Pool2dFunctor<
+    platform::CUDADeviceContext,
+    paddle::operators::math::AvgPool<paddle::platform::float16>,
+    paddle::platform::float16>;
+template class Pool2dGradFunctor<
+    platform::CUDADeviceContext,
+    paddle::operators::math::MaxPoolGrad<paddle::platform::float16>,
+    paddle::platform::float16>;
+template class Pool2dGradFunctor<
+    platform::CUDADeviceContext,
+    paddle::operators::math::AvgPoolGrad<paddle::platform::float16>,
+    paddle::platform::float16>;
+
 template <typename PoolProcess, typename T>
 __global__ void KernelPool3D(
    const int nthreads, const T* input_data, const int channels,
@@ -712,7 +732,7 @@ __global__ void KernelPool3DGrad(
      pwend = min((w_offset) / stride_width + 1, output_width);
    }

-    T gradient = 0;
+    T gradient = static_cast<T>(0.0);
    T input = input_data[index];

    int output_stride;
@@ -848,13 +868,13 @@ __global__ void KernelMaxPool3DGrad(
 }

 /*
-* Tensors are in NCDHW or NDHWC format.
-* Ksize, strides, paddings are three elements. These three elements represent
-* depth, height and width, respectively.
-* Paddings are six elements. These six elements represent depth_forth,
-* depth_back,
-* height_up, height_down, width_left and width_right, respectively.
-*/
+ * Tensors are in NCDHW or NDHWC format.
+ * Ksize and strides are three elements. These three elements represent
+ * depth, height and width, respectively.
+ * Paddings are six elements. These six elements represent depth_forth,
+ * depth_back,
+ * height_up, height_down, width_left and width_right, respectively.
+ */
 template <typename PoolProcess, class T>
 class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
 public:
@@ -952,13 +972,13 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
 };

 /*
-* Tensors are in NCDHW or NDHWC format.
-* Ksize, strides, paddings are three elements. These three elements represent
-* depth, height and width, respectively.
-* Paddings are six elements. These six elements represent depth_forth,
-* depth_back,
-* height_up, height_down, width_left and width_right, respectively.
-*/
+ * Tensors are in NCDHW or NDHWC format.
+ * Ksize and strides are three elements. These three elements represent
+ * depth, height and width, respectively.
+ * Paddings are six elements. These six elements represent depth_forth,
+ * depth_back,
+ * height_up, height_down, width_left and width_right, respectively.
+ */
 template <typename PoolProcess, class T>
 class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
 public:
@@ -1064,13 +1084,13 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
 };

 /*
-* tensors are in NCDHW or NDHWC format.
-* Ksize, strides, paddings are three elements. These three elements represent
-* depth, height and width, respectively.
-* Paddings are six elements. These six elements represent depth_forth,
-* depth_back,
-* height_up, height_down, width_left and width_right, respectively.
-*/
+ * Tensors are in NCDHW or NDHWC format.
+ * Ksize and strides are three elements. These three elements represent
+ * depth, height and width, respectively.
+ * Paddings are six elements. These six elements represent depth_forth,
+ * depth_back,
+ * height_up, height_down, width_left and width_right, respectively.
+ */
 template <class T>
 class MaxPool3dGradFunctor<platform::CUDADeviceContext, T> {
 public:
@@ -1174,6 +1194,8 @@ class MaxPool3dGradFunctor<platform::CUDADeviceContext, T> {

 template class MaxPool3dGradFunctor<platform::CUDADeviceContext, float>;
 template class MaxPool3dGradFunctor<platform::CUDADeviceContext, double>;
+template class MaxPool3dGradFunctor<platform::CUDADeviceContext,
+                                    paddle::platform::float16>;

 template class Pool3dFunctor<platform::CUDADeviceContext,
                             paddle::operators::math::MaxPool<float>, float>;
@@ -1196,6 +1218,23 @@ template class Pool3dGradFunctor<platform::CUDADeviceContext,
                                 paddle::operators::math::AvgPoolGrad<double>,
                                 double>;

+template class Pool3dFunctor<
+    platform::CUDADeviceContext,
+    paddle::operators::math::MaxPool<paddle::platform::float16>,
+    paddle::platform::float16>;
+template class Pool3dFunctor<
+    platform::CUDADeviceContext,
+    paddle::operators::math::AvgPool<paddle::platform::float16>,
+    paddle::platform::float16>;
+template class Pool3dGradFunctor<
+    platform::CUDADeviceContext,
+    paddle::operators::math::MaxPoolGrad<paddle::platform::float16>,
+    paddle::platform::float16>;
+template class Pool3dGradFunctor<
+    platform::CUDADeviceContext,
+    paddle::operators::math::AvgPoolGrad<paddle::platform::float16>,
+    paddle::platform::float16>;
+
 template <typename T1, typename T2>
 __global__ void KernelMaxPool2dWithIdx(
    const int nthreads, const T1* input_data, const int channels,

--- a/paddle/fluid/operators/math/pooling.h
+++ b/paddle/fluid/operators/math/pooling.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
@@ -56,7 +57,7 @@ class MaxPoolGrad {
 public:
  DEVICE inline void compute(const T& x, const T& y, const T& dy, T scale,
                             T* dx) {
-    *dx += dy * (x == y);
+    *dx += dy * static_cast<T>(x == y);
  }
 };


--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/pool_op.h"
+
 #include <unordered_map>
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cudnn_helper.h"
@@ -219,11 +220,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
 #endif

  auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
-  if (input_data_type == framework::proto::VarType::FP16) {
-    PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
-                      platform::errors::InvalidArgument(
-                          "Float16 can only be used when CUDNN is used"));
-  }
+
  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
                                 library_);
 }

--- a/paddle/fluid/operators/pool_op.cu.cc
+++ b/paddle/fluid/operators/pool_op.cu.cc
@@ -18,16 +18,24 @@ namespace ops = paddle::operators;

 REGISTER_OP_CUDA_KERNEL(
    pool2d, ops::PoolKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::PoolKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::PoolKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::PoolKernel<paddle::platform::CUDADeviceContext,
+                    paddle::platform::float16>);
 REGISTER_OP_CUDA_KERNEL(
    pool2d_grad,
    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::PoolGradKernel<paddle::platform::CUDADeviceContext,
+                        paddle::platform::float16>);

 REGISTER_OP_CUDA_KERNEL(
    pool3d, ops::PoolKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::PoolKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::PoolKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::PoolKernel<paddle::platform::CUDADeviceContext,
+                    paddle::platform::float16>);
 REGISTER_OP_CUDA_KERNEL(
    pool3d_grad,
    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, double>);
+    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::PoolGradKernel<paddle::platform::CUDADeviceContext,
+                        paddle::platform::float16>);
--- a/paddle/fluid/operators/pool_op.h
+++ b/paddle/fluid/operators/pool_op.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
@@ -257,7 +258,7 @@ class PoolGradKernel : public framework::OpKernel<T> {
    if (in_x_grad) {
      in_x_grad->mutable_data<T>(context.GetPlace());
      paddle::operators::math::SetConstant<DeviceContext, T> set_constant;
-      set_constant(dev_ctx, in_x_grad, 0.0);
+      set_constant(dev_ctx, in_x_grad, static_cast<T>(0.0));

      switch (ksize.size()) {
        case 2: {

--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -475,6 +475,41 @@ def create_test_cudnn_fp16_class(parent, check_grad=True):
    globals()[cls_name] = TestCUDNNFp16Case


+def create_test_fp16_class(parent, check_grad=True):
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "core is not compiled with CUDA")
+    class TestFp16Case(parent):
+        def init_kernel_type(self):
+            self.use_cudnn = False
+            self.dtype = np.float16
+
+        def test_check_output(self):
+            # TODO(wangzhongpu): support mkldnn op in dygraph mode
+            if core.is_compiled_with_cuda():
+                place = core.CUDAPlace(0)
+                if core.is_float16_supported(place):
+                    self.check_output_with_place(
+                        place,
+                        atol=1e-3,
+                        check_dygraph=(self.use_mkldnn == False))
+
+        def test_check_grad(self):
+            # TODO(wangzhongpu): support mkldnn op in dygraph mode
+            place = core.CUDAPlace(0)
+            if core.is_float16_supported(
+                    place) and self.pool_type != "max" and check_grad:
+                self.check_grad_with_place(
+                    place,
+                    set(['X']),
+                    'Out',
+                    max_relative_error=0.07,
+                    check_dygraph=(self.use_mkldnn == False))
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Fp16Op")
+    TestFp16Case.__name__ = cls_name
+    globals()[cls_name] = TestFp16Case
+
+
 create_test_cudnn_fp16_class(TestPool2D_Op)
 create_test_cudnn_fp16_class(TestCase1, check_grad=False)
 create_test_cudnn_fp16_class(TestCase2)
@@ -482,6 +517,13 @@ create_test_cudnn_fp16_class(TestCase3)
 create_test_cudnn_fp16_class(TestCase4)
 create_test_cudnn_fp16_class(TestCase5)

+create_test_fp16_class(TestPool2D_Op)
+create_test_fp16_class(TestCase1, check_grad=False)
+create_test_fp16_class(TestCase2)
+create_test_fp16_class(TestCase3)
+create_test_fp16_class(TestCase4)
+create_test_fp16_class(TestCase5)
+
 #--------------------test pool2d use ceil mode--------------------



--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
@@ -405,6 +405,25 @@ def create_test_cudnn_fp16_class(parent):
    globals()[cls_name] = TestCUDNNFp16Case


+def create_test_fp16_class(parent):
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "core is not compiled with CUDA")
+    class TestFp16Case(parent):
+        def init_kernel_type(self):
+            self.use_cudnn = False
+            self.dtype = np.float16
+
+        def test_check_output(self):
+            if core.is_compiled_with_cuda():
+                place = core.CUDAPlace(0)
+                if core.is_float16_supported(place):
+                    self.check_output_with_place(place, atol=1e-2)
+
+    cls_name = "{0}_{1}".format(parent.__name__, "Fp16Op")
+    TestFp16Case.__name__ = cls_name
+    globals()[cls_name] = TestFp16Case
+
+
 create_test_cudnn_fp16_class(TestPool3D_Op)
 create_test_cudnn_fp16_class(TestCase1)
 create_test_cudnn_fp16_class(TestCase2)
@@ -412,6 +431,13 @@ create_test_cudnn_fp16_class(TestCase3)
 create_test_cudnn_fp16_class(TestCase4)
 create_test_cudnn_fp16_class(TestCase5)

+create_test_fp16_class(TestPool3D_Op)
+create_test_fp16_class(TestCase1)
+create_test_fp16_class(TestCase2)
+create_test_fp16_class(TestCase3)
+create_test_fp16_class(TestCase4)
+create_test_fp16_class(TestCase5)
+

 # ---- test ceil mode ------
 def create_test_cudnn_use_ceil_class(parent):