From e4de5dc3473f3a319ad6dabb2c665550c69e4434 Mon Sep 17 00:00:00 2001
From: Kexin Zhao
Date: Thu, 15 Mar 2018 14:54:28 -0700
Subject: [PATCH] add conv2d fp16 support

---
 paddle/fluid/operators/conv_cudnn_op.cu.cc             | 16 ++++---
 paddle/fluid/operators/conv_op.cc                      | 17 +++++--
 paddle/fluid/platform/cudnn_helper.h                   | 16 +++++++
 .../paddle/fluid/tests/unittests/op_test.py            | 25 +++++++++++
 .../fluid/tests/unittests/test_conv2d_op.py            | 44 ++++++++++++++++---
 5 files changed, 104 insertions(+), 14 deletions(-)

diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc
index ff0fbf21f86..aec0aba6f7b 100644
--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/conv_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
@@ -133,7 +134,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
     platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
     cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
     // ------------------- cudnn conv forward ---------------------
-    T alpha = 1.0f, beta = 0.0f;
+    T alpha = static_cast<T>(1.0f);
+    T beta = static_cast<T>(0.0f);
     for (int i = 0; i < groups; i++) {
       PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
           handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
@@ -315,16 +317,18 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace,
+namespace plat = paddle::platform;
+REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<float>,
-                   paddle::operators::CUDNNConvOpKernel<double>);
-REGISTER_OP_KERNEL(conv2d_grad, CUDNN, ::paddle::platform::CUDAPlace,
+                   paddle::operators::CUDNNConvOpKernel<double>,
+                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
+REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvGradOpKernel<float>,
                    paddle::operators::CUDNNConvGradOpKernel<double>);
-REGISTER_OP_KERNEL(conv3d, CUDNN, ::paddle::platform::CUDAPlace,
+REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvOpKernel<float>,
                    paddle::operators::CUDNNConvOpKernel<double>);
-REGISTER_OP_KERNEL(conv3d_grad, CUDNN, ::paddle::platform::CUDAPlace,
+REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
                    paddle::operators::CUDNNConvGradOpKernel<float>,
                    paddle::operators::CUDNNConvGradOpKernel<double>);
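Note: the alpha/beta scalars the hunk above casts through static_cast<T> follow cuDNN's
documented forward semantics, y = alpha * conv(x, w) + beta * y, so with beta = 0 any
stale contents of the output buffer are discarded. A standalone numpy sketch of that
blending semantics (illustration only, not part of the patch):

    import numpy as np

    alpha, beta = 1.0, 0.0
    conv_result = np.array([2.0, 4.0], dtype=np.float16)  # stand-in for conv(x, w)
    y = np.array([9.0, 9.0], dtype=np.float16)            # stale output buffer

    # with beta = 0 the old values of y never contribute to the result
    y = alpha * conv_result + beta * y
    print(y)  # [2. 4.]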
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 4b02b80d777..e3fc21c90f9 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -83,12 +83,23 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
   }
 #endif
 
+  auto input_data_type =
+      framework::ToDataType(ctx.Input<Tensor>("Input")->type());
+  auto filter_data_type =
+      framework::ToDataType(ctx.Input<Tensor>("Filter")->type());
+  PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
+                    "input and filter data type should be consistent");
+
+  if (input_data_type == framework::proto::VarType::FP16) {
+    PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
+                      "float16 can only be used when CUDNN is used");
+  }
+
   std::string data_format = ctx.Attr<std::string>("data_format");
   // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
   framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
-  return framework::OpKernelType(
-      framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
-      ctx.GetPlace(), layout_, library_);
+  return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
+                                 library_);
 }
 
 Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index 9a2ac3ff33d..510a1707ba8 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/macros.h"
 
 namespace paddle {
@@ -80,6 +81,21 @@ enum class PoolingMode {
 template <typename T>
 class CudnnDataType;
 
+template <>
+class CudnnDataType<float16> {
+ public:
+  static const cudnnDataType_t type = CUDNN_DATA_HALF;
+  typedef const float16 ScalingParamType;
+  static ScalingParamType* kOne() {
+    static ScalingParamType v = static_cast<float16>(1.0);
+    return &v;
+  }
+  static ScalingParamType* kZero() {
+    static ScalingParamType v = static_cast<float16>(0.0);
+    return &v;
+  }
+};
+
 template <>
 class CudnnDataType<float> {
  public:
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index f7e02595ec3..6d4684b024d 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -469,6 +469,31 @@ class OpTest(unittest.TestCase):
         tensor.set_lod(lod)
         return tensor
 
+    @staticmethod
+    def create_view(input):
+        """Create a view of the input numpy array.
+
+        numpy float16 is bound to paddle::platform::float16
+        in tensor_py.h with the help of numpy uint16: the
+        internal memory representation of float16 is uint16_t
+        in paddle and np.uint16 in numpy, and the two are
+        bound together.
+
+        Args:
+            input: input numpy array
+
+        Returns:
+            input_view: if the dtype of input is np.float16, input_view
+                reinterprets input with dtype np.uint16.
+                Otherwise, input_view is input itself.
+        """
+        if input.dtype == np.float16:
+            # view only reinterprets memory, without copying
+            input_view = input.view(np.uint16)
+        else:
+            input_view = input
+        return input_view
+
     def _get_gradient(self, input_to_check, place, output_names, no_grad_set):
         prog = Program()
         block = prog.global_block()
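Note: the float16/uint16 trick that create_view relies on is plain numpy behavior:
.view(np.uint16) reinterprets the same 2-byte elements without copying, so the raw
IEEE 754 half-precision bits can be handed to paddle::platform::float16, which has
the same memory layout. A standalone sketch (illustration only, not part of the patch):

    import numpy as np

    x = np.array([1.0, -2.5], dtype=np.float16)
    bits = x.view(np.uint16)

    print(bits)                     # half-precision bit patterns: [15360 49408]
    assert bits.base is x           # a view over the same buffer, not a copy
    assert bits.nbytes == x.nbytes  # still 2 bytes per element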
+ """ + if input.dtype == np.float16: + # view will only reinterpret memory without copying + input_view = input.view(np.uint16) + else: + input_view = input + return input_view + def _get_gradient(self, input_to_check, place, output_names, no_grad_set): prog = Program() block = prog.global_block() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index a49fecf0950..a16b6c8e927 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -68,6 +68,7 @@ class TestConv2dOp(OpTest): self.init_op_type() self.init_group() self.init_dilation() + self.init_data_type() self.init_test_case() conv2d_param = { @@ -75,12 +76,22 @@ class TestConv2dOp(OpTest): 'pad': self.pad, 'dilation': self.dilations } - input = np.random.random(self.input_size).astype("float32") - filter = np.random.random(self.filter_size).astype("float32") - output = conv2d_forward_naive(input, filter, self.groups, - conv2d_param).astype('float32') - self.inputs = {'Input': input, 'Filter': filter} + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) + output = conv2d_forward_naive(self.input, self.filter, self.groups, + conv2d_param).astype(self.dtype) + + # numpy float16 is binded to paddle::platform::float16 + # in tensor_py.h via the help of numpy uint16 because + # the internal memory representation of float16 is + # uint16_t in paddle or np.uint16 in numpy, which are + # themselves binded together. + self.inputs = { + 'Input': input.view(np.uint16) + if self.dtype == np.float16 else input, + 'Filter': create_view(filter) + } self.attrs = { 'strides': self.stride, 'paddings': self.pad, @@ -148,6 +159,9 @@ class TestConv2dOp(OpTest): f_c = self.input_size[1] / self.groups self.filter_size = [6, f_c, 3, 3] + def init_data_type(self): + self.dtype = np.float32 + def init_dilation(self): self.dilations = [1, 1] @@ -232,6 +246,26 @@ class TestCUDNN(TestConv2dOp): self.op_type = "conv2d" +class TestFP16CUDNN(TestCUDNN): + def init_data_type(self): + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-1) + + def test_check_grad(self): + pass + + def test_check_grad_no_filter(self): + pass + + def test_check_grad_no_input(self): + pass + + class TestCUDNNWithPad(TestWithPad): def init_op_type(self): self.use_cudnn = True -- GitLab