From c1187cd6f4a4fc071ff78e4829bb7fcfbe02f43c Mon Sep 17 00:00:00 2001
From: wangchaochaohu
Date: Wed, 8 Apr 2020 19:39:08 +0800
Subject: [PATCH] Fp16 refine for fusion group (#23472)

---
 .../ir/fusion_group/code_generator.cc        |   6 +-
 .../ir/fusion_group/code_generator_helper.cc |  78 +++----
 .../ir/fusion_group/code_generator_helper.h  |   1 -
 .../ir/fusion_group/cuda_resources.h         | 219 +++++++++++++++++-
 .../framework/ir/fusion_group/operation.cc   |  24 +-
 5 files changed, 267 insertions(+), 61 deletions(-)

diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.cc b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
index e63764ae915..f88e872dbad 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
@@ -36,7 +36,7 @@ std::string ExtractDataType(const std::vector<Node*>& nodes) {
     } else if (dtype == proto::VarType::FP64) {
       dtype_str = "double";
     } else if (dtype == proto::VarType::FP16) {
-      dtype_str = "float16";
+      dtype_str = "__half";
     }
     break;
   }
@@ -147,13 +147,13 @@ std::string CodeGenerator::Generate(
   }
   std::string predefined_cuda_functions = "";
   if (all_dtype.find("float") != all_dtype.end() &&
-      all_dtype.find("float16") == all_dtype.end()) {
+      all_dtype.find("__half") == all_dtype.end()) {
     predefined_cuda_functions += predefined_cuda_functions_fp32;
   }
   if (all_dtype.find("double") != all_dtype.end()) {
     predefined_cuda_functions += predefined_cuda_functions_fp64;
   }
-  if (all_dtype.find("float16") != all_dtype.end()) {
+  if (all_dtype.find("__half") != all_dtype.end()) {
     predefined_cuda_functions += predefined_cuda_functions_fp16;
   }
   return predefined_cuda_functions + code_templates_[0].Format(template_var);
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc
index 62e226d891f..ed6007d36e0 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc
@@ -112,22 +112,7 @@ static std::string RefineTemplateWithAttr(const std::string& op_type,
   return ret;
 }
 
-// In order to avoid multiple __half2float function calls, we do this
-// optimization
-static std::string OptimzeFP16RHS(std::unordered_set<int>* used,
-                                  const int index,
-                                  const std::vector<int>& input_ids) {
-  std::stringstream ret;
-  if (used->find(input_ids[index]) == used->end()) {
-    ret << "float half2fp32_" + TmpName(input_ids[index]) + " = __half2float(" +
-               TmpName(input_ids[index]) + ");";
-  }
-
-  return ret.str();
-}
-
 std::string OperationExpression::GetRHS(std::unordered_set<int>* used,
-                                        std::string* half2fp32_statement,
                                         size_t exprs_index) const {
   auto rhs = OperationMap::Instance().Get(op_type_).exprs[exprs_index];
   auto num_operands = OperationMap::Instance().Get(op_type_).num_operands;
@@ -136,16 +121,22 @@ std::string OperationExpression::GetRHS(std::unordered_set<int>* used,
     size_t input_size = input_ids_.size();
     rhs = ExpandMultivariateTemplate(rhs, input_size);
   }
-  for (size_t i = 0; i < rhs.size(); i++) {
-    size_t pos = i;
+
+  size_t pos = 0;
+  while (pos < rhs.size()) {
     if (rhs[pos] == '$' && rhs[pos + 1] == '{') {
-      int length = 0;
-      while (rhs[pos + 2 + length] != '}') {
-        length++;
+      size_t length = 0;
+      int bracket_number = 1;
+      for (length = 0; (pos + 2 + length) < rhs.size(); length++) {
+        char ch = rhs[pos + 2 + length];
+        if (ch == '}') bracket_number--;
+        if (ch == '{') bracket_number++;
+        if (bracket_number == 0) break;
       }
       std::string index_str = rhs.substr(pos + 2,
length); std::string refine_str = RefineTemplateWithAttr(op_type_, index_str, attr_); + std::string var_name; if (index_str == refine_str) { int index = StringTo(index_str); PADDLE_ENFORCE_LT(index, input_ids_.size(), @@ -160,20 +151,31 @@ std::string OperationExpression::GetRHS(std::unordered_set* used, index, op_type_, input_ids_[index])); // TODO(wangchaochaohu): Here fp16 convert to float to do comupte, we // need to add general fp16 compute later. - std::string var_name; - if (rhs_type_ == "float16") { - half2fp32_statement->append(OptimzeFP16RHS(used, index, input_ids_)); - var_name = "half2fp32_" + TmpName(input_ids_[index]); - } else { - var_name = TmpName(input_ids_[index]); - } + var_name = TmpName(input_ids_[index]); rhs.replace(pos, length + 3, var_name); used->insert(input_ids_[index]); } else { - std::string var_name = refine_str; + var_name = refine_str; rhs.replace(pos, length + 3, var_name); } + pos = pos + var_name.length(); + } + pos++; + } + pos = 0; + while (pos < rhs.size()) { + if (rhs[pos] == '%' && rhs[pos + 1] == '{') { + int length = 0; + while (rhs[pos + 2 + length] != '}') { + length++; + } + std::string number_str = rhs.substr(pos + 2, length); + if (rhs_type_ == "__half") + number_str = "__float2half(" + number_str + ")"; + rhs.replace(pos, length + 3, number_str); + pos = pos + number_str.length(); } + pos++; } return rhs; } @@ -192,28 +194,24 @@ bool OperationExpression::IsSupport() const { // unique for the node which belong the group std::string OperationExpression::GetExpression( std::unordered_set* used) const { - std::string half2fp32_statement; std::stringstream ret; if (IsSupport()) { for (size_t i = 0; i < output_ids_.size(); ++i) { std::string cast_str = ""; - if ((lhs_type_ == rhs_type_ && rhs_type_ != "float16") || - (lhs_type_ != rhs_type_ && rhs_type_ == "float16")) { - ret << GetLHS(i) << " = " << GetRHS(used, &half2fp32_statement, i) - << ";"; + if (lhs_type_ == rhs_type_) { + ret << GetLHS(i) << " = " << GetRHS(used, i) << ";"; } else { - if ((lhs_type_ == rhs_type_ && rhs_type_ == "float16") || - lhs_type_ == "float16") { + if (lhs_type_ == "__half") cast_str = "__float2half"; - } else { + else if (rhs_type_ == "__half") + cast_str = "__half2float"; + else cast_str = "static_cast<" + lhs_type_ + ">"; - } - ret << GetLHS(i) << " = " << cast_str << "(" - << GetRHS(used, &half2fp32_statement, i) << ");"; + ret << GetLHS(i) << " = " << cast_str << "(" << GetRHS(used, i) << ");"; } } } - return half2fp32_statement + ret.str(); + return ret.str(); } } // namespace fusion_group diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h index 2a35ac6f632..2f4a5f5901f 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h @@ -68,7 +68,6 @@ class OperationExpression { private: // TODO(wangchao): make offset more flexible we add stride and basic offset std::string GetRHS(std::unordered_set* used, - std::string* half2fp32_statement, size_t exprs_index = 0) const; std::string GetLHS(size_t i = 0) const; diff --git a/paddle/fluid/framework/ir/fusion_group/cuda_resources.h b/paddle/fluid/framework/ir/fusion_group/cuda_resources.h index fa0bd96ccda..945b7929db9 100644 --- a/paddle/fluid/framework/ir/fusion_group/cuda_resources.h +++ b/paddle/fluid/framework/ir/fusion_group/cuda_resources.h @@ -36,11 +36,6 @@ __device__ inline double Sqrt(double x) { return sqrt(x); } )"; static constexpr 
char predefined_cuda_functions_fp16[] = R"( -__device__ inline float Max(float x, float y) { return fmaxf(x, y); } -__device__ inline float Exp(float x) { return expf(x); } -__device__ inline float Log(float x) { return logf(x); } -__device__ inline float Sqrt(float x) { return sqrtf(x); } - #define __HALF_TO_US(var) *(reinterpret_cast(&(var))) #define __HALF_TO_CUS(var) *(reinterpret_cast(&(var))) @@ -65,6 +60,219 @@ __device__ float __half2float(const __half h) { return val; } +#define __CUDA_FP16_DECL__ __host__ __device__ +/****************************************************************************** +* __half comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF_MACRO(name) do {\ + unsigned short val; \ + asm( "{ .reg .pred __$temp3;\n" \ + " setp."#name".f16 __$temp3, %1, %2;\n" \ + " selp.u16 %0, 1, 0, __$temp3;}" \ + : "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \ + return val ? true : false; \ +} while(0); +__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(eq); +} +__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ne); +} +__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(le); +} +__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ge); +} +__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(lt); +} +__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(gt); +} +__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(equ); +} +__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(neu); +} +__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(leu); +} +__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(geu); +} +__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ltu); +} +__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(gtu); +} +#undef __COMPARISON_OP_HALF_MACRO + +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +#define __BINARY_OP_HALF_MACRO(name) do {\ + __half val; \ + asm( "{"#name".f16 %0,%1,%2;\n}" \ + :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \ + return val; \ +} while(0); +__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add); +} +__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub); +} +__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul); +} +__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add.sat); +} +__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub.sat); +} +__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul.sat); +} +#undef __BINARY_OP_HALF_MACRO +#define __TERNARY_OP_HALF_MACRO(name) do {\ + __half val; \ + asm( "{"#name".f16 %0,%1,%2,%3;\n}" \ + :"=h"(__HALF_TO_US(val)) : 
"h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \ + return val; \ +} while(0); +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn); +} +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn.sat); +} +#undef __TERNARY_OP_HALF2_MACRO +__CUDA_FP16_DECL__ __half __hdiv(__half a, __half b) { + __half v, abs, den; + __HALF_TO_US(den) = 0x008F; + float fa, fb, fv, rcp; + + fa = __half2float(a); + fb = __half2float(b); + + asm("{rcp.approx.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb)); + + fv = rcp * fa; + + v = __float2half(fv); + __HALF_TO_US(abs) = (unsigned short)(((unsigned int)__HALF_TO_CUS(v)) & 0x00007FFF); + if (__hlt(abs, den) && (!(__HALF_TO_CUS(abs) == 0x0000))) { + float err = __fmaf_rn(-fb, fv, fa); + fv = __fmaf_rn(rcp, err, fv); + v = __float2half(fv); + } + return v; +} + + +/* Some basic arithmetic operations expected of a builtin */ +__device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); } +__device__ __forceinline__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); } +__device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); } +__device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); } + +/* Some basic comparison operations to make it look like a builtin */ +__device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); } +__device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hne(lh, rh); } +__device__ __forceinline__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); } +__device__ __forceinline__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); } +__device__ __forceinline__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); } +__device__ __forceinline__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); } + +#define __SPEC_CASE(i,r, spc, ulp) \ + "{.reg.b16 spc, ulp, p;\n"\ + " mov.b16 spc,"#spc";\n"\ + " mov.b16 ulp,"#ulp";\n"\ + " set.eq.f16.f16 p,"#i", spc;\n"\ + " fma.rn.f16 "#r",p,ulp,"#r";\n}\n" + +__CUDA_FP16_DECL__ __half hexp(const __half a) { + __half val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 h,r; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " mov.b32 C, 0x3fb8aa3b; \n" + " mul.f32 f,f,C; \n" + " ex2.approx.f32 f,f; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X1F79, 0x9400) + __SPEC_CASE(h, r, 0X25CF, 0x9400) + __SPEC_CASE(h, r, 0XC13B, 0x0400) + __SPEC_CASE(h, r, 0XC1EF, 0x0200) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} + +__CUDA_FP16_DECL__ __half hlog(const __half a) { + __half val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 r,h; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " lg2.approx.f32 f,f; \n" + " mov.b32 C, 0x3f317218; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X160D, 0x9C00) + __SPEC_CASE(h, r, 0X3BFE, 0x8010) + __SPEC_CASE(h, r, 0X3C0B, 0x8080) + __SPEC_CASE(h, r, 0X6051, 0x1C00) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} + +#define __APPROX_FCAST(fun) do {\ + __half val;\ + asm("{.reg.b32 f; \n"\ + " .reg.b16 r; \n"\ + " mov.b16 r,%1; \n"\ + " cvt.f32.f16 f,r; \n"\ + " "#fun".approx.f32 f,f; \n"\ + " 
cvt.rn.f16.f32 r,f;  \n"\
+      "  mov.b16 %0,r;      \n"\
+      "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\
+   return val;\
+} while(0);
+__CUDA_FP16_DECL__ __half hsqrt(const __half a) {
+    __APPROX_FCAST(sqrt);
+}
+
+__device__ inline __half Exp(const __half x) { return hexp(x); }
+__device__ inline __half Log(const __half x) { return hlog(x); }
+__device__ inline __half Sqrt(const __half x) { return hsqrt(x); }
+
 #undef __HALF_TO_US
 #undef __HALF_TO_CUS
 
@@ -81,7 +289,6 @@ extern "C" __global__ void $func_name($parameters) {
   }
 }
 )";
-
 }  // namespace fusion_group
 }  // namespace ir
 }  // namespace framework
diff --git a/paddle/fluid/framework/ir/fusion_group/operation.cc b/paddle/fluid/framework/ir/fusion_group/operation.cc
index 6c4bb98e1b6..ef715384573 100644
--- a/paddle/fluid/framework/ir/fusion_group/operation.cc
+++ b/paddle/fluid/framework/ir/fusion_group/operation.cc
@@ -91,17 +91,18 @@ void OperationMap::InsertUnaryElementwiseOperations() {
   // relu:
   //  out = f(x) = x > 0 ? x : 0
   //  dx = dout * (out > 0 ? 1 : 0)
-  insert_handler("relu", "${0} > 0 ? ${0} : 0", {"${1} > 0 ? ${2} : 0"});
+  insert_handler("relu", "${0} > %{0.0} ? ${0} : %{0.0}",
+                 {"${1} > %{0.0} ? ${2} : %{0.0}"});
   // sigmoid:
   //  out = f(x) = 1.0 / (1.0 + exp(-x))
   //  dx = dout * out * (1 - out)
-  insert_handler("sigmoid", "1.0 / (1.0 + Exp(- ${0}))",
-                 {"${2} * ${1} * (1.0 - ${1})"});
+  insert_handler("sigmoid", "%{1.0} / (%{1.0} + Exp(- ${0}))",
+                 {"${2} * ${1} * (%{1.0} - ${1})"});
   // tanh:
   //  out = f(x) = 2.0 / (1.0 + exp(-2.0 * x)) - 1.0;
   //  dx = dout * (1 - out * out)
-  insert_handler("tanh", "2.0 / (1.0 + Exp(-2.0 * ${0})) - 1.0",
-                 {"${2} * (1.0 - ${1} * ${1})"});
+  insert_handler("tanh", "%{2.0} / (%{1.0} + Exp(-%{2.0} * ${0})) - %{1.0}",
+                 {"${2} * (%{1.0} - ${1} * ${1})"});
 
   // cast:
   //  out = static_cast<T>(x)
@@ -112,22 +113,23 @@ void OperationMap::InsertUnaryElementwiseOperations() {
   // sqrt:
   //  out = x^(1/2)
   //  dx = dout * 0.5 / out
-  insert_handler("sqrt", "Sqrt(${0})", {"${2} * 0.5 / ${1}"});
+  insert_handler("sqrt", "Sqrt(${0})", {"${2} * %{0.5} / ${1}"});
   // square:
   //  out = x^2
   //  dx = dout * 2.0 * x
-  insert_handler("square", "${0} * ${0}", {"${2} * 2.0 * ${0}"});
+  insert_handler("square", "${0} * ${0}", {"${2} * %{2.0} * ${0}"});
 
   // scale
   //  out = (bias_after_scale) ? scale * X + bias : scale(X + bias)
   //  here we use '=' operator to separate the default value
   // TODO(wangchaochaohu): Later we need to support Tensor input for scale and
   // bias.
-  insert_handler("scale",
-                 "${bias_after_scale=true} ? (${scale=1.0} * ${0} + "
-                 "${bias=0.0}) : (${scale=1.0} * (${0} + ${bias=0.0}))",
-                 {});
+  insert_handler(
+      "scale",
+      "${bias_after_scale=true} ? (${scale=%{1.0}} * ${0} + "
+      "${bias=%{0.0}}) : (${scale=%{1.0}} * (${0} + ${bias=%{0.0}}))",
+      {});
 }
 
 void OperationMap::InsertBinaryElementwiseOperations() {
-- 
GitLab
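
Note on the new %{...} tokens used throughout operation.cc: they mark numeric literals in an operation template. GetRHS in code_generator_helper.cc runs a second scan over the expression that emits such a literal unchanged when the expression dtype is float or double, and wraps it in __float2half(...) when the dtype is __half, so constants like 1.0 stay valid inside fp16 expressions. The standalone C++ sketch below mirrors that scan; it is illustrative only (SubstituteNumericLiterals is not a Paddle symbol), and the ${...} operand substitution is assumed to happen in a separate, earlier pass.

// Standalone sketch of the "%{...}" literal pass (illustrative; not Paddle code).
// Build with: g++ -std=c++11 literal_pass.cc && ./a.out
#include <iostream>
#include <string>

// Rewrites every "%{literal}" token: the literal is kept as-is for float/double
// expressions and wrapped in __float2half(...) when the RHS dtype is __half.
std::string SubstituteNumericLiterals(std::string rhs, const std::string& rhs_type) {
  size_t pos = 0;
  while (pos < rhs.size()) {
    if (rhs[pos] == '%' && pos + 1 < rhs.size() && rhs[pos + 1] == '{') {
      size_t length = 0;
      while (pos + 2 + length < rhs.size() && rhs[pos + 2 + length] != '}') {
        ++length;
      }
      std::string number_str = rhs.substr(pos + 2, length);
      if (rhs_type == "__half") {
        number_str = "__float2half(" + number_str + ")";
      }
      // "%{" + literal + "}" occupies length + 3 characters.
      rhs.replace(pos, length + 3, number_str);
      pos += number_str.length();
    }
    ++pos;
  }
  return rhs;
}

int main() {
  const std::string sigmoid = "%{1.0} / (%{1.0} + Exp(- ${0}))";
  std::cout << SubstituteNumericLiterals(sigmoid, "float") << std::endl;
  // prints: 1.0 / (1.0 + Exp(- ${0}))
  std::cout << SubstituteNumericLiterals(sigmoid, "__half") << std::endl;
  // prints: __float2half(1.0) / (__float2half(1.0) + Exp(- ${0}))
  return 0;
}

In the patch itself the earlier ${...} operand scan now counts braces (bracket_number) so that nested defaults such as ${scale=%{1.0}} in the scale template are consumed as a single token before this literal pass runs.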
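Related to the dtype string change from "float16" to "__half": ExtractDataType now reports "__half" for FP16 variables, and GetExpression picks the conversion applied to the RHS from the lhs/rhs dtype strings. A minimal sketch of that selection follows; the helper name SelectCast is invented for illustration and is not part of the patch.

#include <iostream>
#include <string>

// Mirrors the branch added to OperationExpression::GetExpression: no cast when the
// dtypes match, the CUDA __half conversion intrinsics when either side is __half,
// and a plain static_cast for the remaining float/double combinations.
std::string SelectCast(const std::string& lhs_type, const std::string& rhs_type) {
  if (lhs_type == rhs_type) return "";
  if (lhs_type == "__half") return "__float2half";
  if (rhs_type == "__half") return "__half2float";
  return "static_cast<" + lhs_type + ">";
}

int main() {
  std::cout << SelectCast("__half", "float") << "\n";   // __float2half
  std::cout << SelectCast("float", "__half") << "\n";   // __half2float
  std::cout << SelectCast("double", "float") << "\n";   // static_cast<double>
  return 0;
}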
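For context, the kernel that a fused fp16 subgraph expands to is roughly equivalent to the hand-written CUDA below. This is a sketch, not generated output: it uses the standard <cuda_fp16.h> operators and hexp (device code for compute capability 5.3+, e.g. nvcc -arch=sm_53) instead of the PTX helpers that predefined_cuda_functions_fp16 inlines, and the kernel and variable names are invented.

#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cstdio>

// Roughly what the generated $compute_body amounts to for sigmoid followed by relu
// in fp16: literals arrive wrapped in __float2half(...), Exp maps to hexp.
extern "C" __global__ void fused_sigmoid_relu_fp16(int N, const __half* x,
                                                   __half* out) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += gridDim.x * blockDim.x) {
    __half tmp_0 = x[i];
    // sigmoid: %{1.0} / (%{1.0} + Exp(- ${0}))
    __half tmp_1 = __float2half(1.0f) / (__float2half(1.0f) + hexp(-tmp_0));
    // relu: ${0} > %{0.0} ? ${0} : %{0.0}
    out[i] = tmp_1 > __float2half(0.0f) ? tmp_1 : __float2half(0.0f);
  }
}

int main() {
  const int N = 8;
  __half host_in[N], host_out[N];
  for (int i = 0; i < N; ++i) host_in[i] = __float2half(i - 4.0f);

  __half *dev_in, *dev_out;
  cudaMalloc(&dev_in, N * sizeof(__half));
  cudaMalloc(&dev_out, N * sizeof(__half));
  cudaMemcpy(dev_in, host_in, N * sizeof(__half), cudaMemcpyHostToDevice);

  fused_sigmoid_relu_fp16<<<1, 32>>>(N, dev_in, dev_out);
  cudaMemcpy(host_out, dev_out, N * sizeof(__half), cudaMemcpyDeviceToHost);

  for (int i = 0; i < N; ++i) printf("%f ", __half2float(host_out[i]));
  printf("\n");

  cudaFree(dev_in);
  cudaFree(dev_out);
  return 0;
}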