From 6b6f7a217b442acaec97f3fccaa04baaa6de2b45 Mon Sep 17 00:00:00 2001
From: ming1753 <61511741+ming1753@users.noreply.github.com>
Date: Fri, 22 Jul 2022 22:53:19 +0800
Subject: [PATCH] Fc fp16 (#44558)

* (modified) fc support fp16

* __CUDA_ARCH__ version

* delete half

* delete half
---
 paddle/phi/kernels/funcs/fc_functor.cu | 95 +-------------------------
 1 file changed, 1 insertion(+), 94 deletions(-)

diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu
index d0bd7567c7d..d50bec2f635 100644
--- a/paddle/phi/kernels/funcs/fc_functor.cu
+++ b/paddle/phi/kernels/funcs/fc_functor.cu
@@ -36,14 +36,6 @@ struct FcTypeTraits<double> {
   typedef double4 Type;
 };
 
-#if defined(PADDLE_WITH_CUDA)
-#include <cuda_fp16.h>
-
-template <>
-struct FcTypeTraits<float16> {
-  typedef half2 Type;
-};
-#else
 struct float16_4 {
   float16 x, y, z, w;
 };
@@ -52,7 +44,6 @@ template <>
 struct FcTypeTraits<float16> {
   typedef float16_4 Type;
 };
-#endif
 
 template <typename T>
 __global__ void bias_relu_v4(const int num, const T* bias, T* data, int K) {
@@ -126,95 +117,12 @@ void AddReluKernel(
   }
 }
 
-#if defined(PADDLE_WITH_CUDA)
-template <bool DoRelu>
-__global__ void bias_relu_v2(const int num,
-                             const half2* bias,
-                             half2* data,
-                             int K) {
-  int tid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (tid < num) {
-    int bias_idx = tid % K;
-    const half2 bias_ptr = bias[bias_idx];
-    const half2 in_ptr = data[tid];
-    half2 packed_val = __hadd2(bias_ptr, in_ptr);
-    if (DoRelu) {
-#if __CUDA_ARCH__ >= 800
-      packed_val = __hmax2(__half2(0, 0), packed_val);
-#else
-      packed_val = __hmul2(__hgt2(__half2(0, 0), packed_val), packed_val);
-#endif
-    }
-    data[tid] = packed_val;
-  }
-}
-
-template <bool DoRelu, int BlockDim>
-__global__ void InplaceAddReluKernel(const int N,
-                                     const half* bias,
-                                     half* data) {
-  int offset = blockIdx.x * N;
-  for (int i = threadIdx.x; i < N; i += BlockDim) {
-    half temp;
-#if defined(__HIPCC__) || __CUDA_ARCH__ >= 350
-    temp = __ldg(data + offset + i) + __ldg(bias + i);
-#else
-    temp = data[offset + i] + bias[i];
-#endif
-    if (DoRelu) {
-#if __CUDA_ARCH__ >= 800
-      data[offset + i] = __hmax(0, temp);
-#else
-      data[offset + i] = __hmul(__hgt(temp, 0), temp);
-#endif
-    } else {
-      data[offset + i] = temp;
-    }
-  }
-}
-
-template <>
-void AddReluKernel(cudaStream_t stream,
-                   const int M,
-                   const int N,
-                   float16* Y,
-                   const float16* B,
-                   bool relu) {
-  if (N % 2 == 0) {
-    const int threads = 256;
-    const int num = M * N / 2;
-    const int blocks = (num + threads - 1) / threads;
-    typedef typename FcTypeTraits<float16>::Type trans_type;
-    auto* bias_ptr_v2 = reinterpret_cast<const trans_type*>(B);
-    auto* data_ptr_v2 = reinterpret_cast<trans_type*>(Y);
-    if (relu) {
-      bias_relu_v2<true><<<blocks, threads, 0, stream>>>(
-          num, bias_ptr_v2, data_ptr_v2, N / 2);
-    } else {
-      bias_relu_v2<false><<<blocks, threads, 0, stream>>>(
-          num, bias_ptr_v2, data_ptr_v2, N / 2);
-    }
-  } else {
-    const int threads = 256;
-    const int blocks = M;
-    auto* halfB = reinterpret_cast<const half*>(B);
-    auto* halfY = reinterpret_cast<half*>(Y);
-    if (relu) {
-      InplaceAddReluKernel<true, threads>
-          <<<blocks, threads, 0, stream>>>(N, halfB, halfY);
-    } else {
-      InplaceAddReluKernel<false, threads>
-          <<<blocks, threads, 0, stream>>>(N, halfB, halfY);
-    }
-  }
-}
-
-#else
 template <bool DoRelu, int BlockDim>
 __global__ void InplaceAddReluKernel(const int N,
                                      const float16* bias,
                                      float16* data) {
   int offset = blockIdx.x * N;
+
   for (int i = threadIdx.x; i < N; i += BlockDim) {
     float16 temp;
     temp = data[offset + i] + bias[i];
@@ -260,7 +168,6 @@ void AddReluKernel(gpuStream_t stream,
     }
   }
 }
-#endif
 
 template <typename T>
 void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
-- 
GitLab
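
Illustrative sketch (not part of the patch): the diff removes the CUDA-only half2 specialization and keeps the generic float16_4 path, whose core pattern is a vectorized bias add followed by an optional ReLU. The sketch below shows that pattern with plain float4 so it builds with stock nvcc; the kernel name VecBiasRelu and all host-side names are invented for this example and do not appear in fc_functor.cu.

// Sketch only: vectorized bias + optional ReLU epilogue, 4 values per thread,
// mirroring the bias_relu_v4 pattern retained by the patch (assumed shapes:
// Y is M x N row-major, B has N entries, N % 4 == 0).
#include <cstdio>
#include <cuda_runtime.h>

template <bool DoRelu>
__global__ void VecBiasRelu(int num, const float4* bias, float4* data, int k) {
  // num = M * N / 4 float4 elements in Y; k = N / 4 float4s per row, so
  // bias[tid % k] broadcasts the same bias row to every output row.
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < num) {
    float4 b = bias[tid % k];
    float4 v = data[tid];
    v.x += b.x; v.y += b.y; v.z += b.z; v.w += b.w;
    if (DoRelu) {
      v.x = fmaxf(v.x, 0.f); v.y = fmaxf(v.y, 0.f);
      v.z = fmaxf(v.z, 0.f); v.w = fmaxf(v.w, 0.f);
    }
    data[tid] = v;
  }
}

int main() {
  const int M = 2, N = 8;      // fast path requires N % 4 == 0
  const int num = M * N / 4;   // number of float4 elements in Y
  float hY[M * N], hB[N];
  for (int i = 0; i < M * N; ++i) hY[i] = static_cast<float>(i % 3) - 1.0f;
  for (int i = 0; i < N; ++i) hB[i] = 0.25f;

  float *dY = nullptr, *dB = nullptr;
  cudaMalloc(&dY, sizeof(hY));
  cudaMalloc(&dB, sizeof(hB));
  cudaMemcpy(dY, hY, sizeof(hY), cudaMemcpyHostToDevice);
  cudaMemcpy(dB, hB, sizeof(hB), cudaMemcpyHostToDevice);

  const int threads = 256;
  const int blocks = (num + threads - 1) / threads;
  VecBiasRelu<true><<<blocks, threads>>>(num,
                                         reinterpret_cast<const float4*>(dB),
                                         reinterpret_cast<float4*>(dY),
                                         N / 4);

  cudaMemcpy(hY, dY, sizeof(hY), cudaMemcpyDeviceToHost);
  for (int i = 0; i < M * N; ++i) printf("%g ", hY[i]);  // all values >= 0 after ReLU
  printf("\n");
  cudaFree(dY);
  cudaFree(dB);
  return 0;
}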