From c1187cd6f4a4fc071ff78e4829bb7fcfbe02f43c Mon Sep 17 00:00:00 2001 From: wangchaochaohu Date: Wed, 8 Apr 2020 19:39:08 +0800 Subject: [PATCH] Fp16 refine for fusion group (#23472) --- .../ir/fusion_group/code_generator.cc | 6 +- .../ir/fusion_group/code_generator_helper.cc | 78 +++---- .../ir/fusion_group/code_generator_helper.h | 1 - .../ir/fusion_group/cuda_resources.h | 219 +++++++++++++++++- .../framework/ir/fusion_group/operation.cc | 24 +- 5 files changed, 267 insertions(+), 61 deletions(-) diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.cc b/paddle/fluid/framework/ir/fusion_group/code_generator.cc index e63764ae91..f88e872dba 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator.cc @@ -36,7 +36,7 @@ std::string ExtractDataType(const std::vector& nodes) { } else if (dtype == proto::VarType::FP64) { dtype_str = "double"; } else if (dtype == proto::VarType::FP16) { - dtype_str = "float16"; + dtype_str = "__half"; } break; } @@ -147,13 +147,13 @@ std::string CodeGenerator::Generate( } std::string predefined_cuda_functions = ""; if (all_dtype.find("float") != all_dtype.end() && - all_dtype.find("float16") == all_dtype.end()) { + all_dtype.find("__half") == all_dtype.end()) { predefined_cuda_functions += predefined_cuda_functions_fp32; } if (all_dtype.find("double") != all_dtype.end()) { predefined_cuda_functions += predefined_cuda_functions_fp64; } - if (all_dtype.find("float16") != all_dtype.end()) { + if (all_dtype.find("__half") != all_dtype.end()) { predefined_cuda_functions += predefined_cuda_functions_fp16; } return predefined_cuda_functions + code_templates_[0].Format(template_var); diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc index 62e226d891..ed6007d36e 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc @@ -112,22 +112,7 @@ static std::string RefineTemplateWithAttr(const std::string& op_type, return ret; } -// In order to avoid multiple __half2float function calls, we do this -// optimization -static std::string OptimzeFP16RHS(std::unordered_set* used, - const int index, - const std::vector& input_ids) { - std::stringstream ret; - if (used->find(input_ids[index]) == used->end()) { - ret << "float half2fp32_" + TmpName(input_ids[index]) + " = __half2float(" + - TmpName(input_ids[index]) + ");"; - } - - return ret.str(); -} - std::string OperationExpression::GetRHS(std::unordered_set* used, - std::string* half2fp32_statement, size_t exprs_index) const { auto rhs = OperationMap::Instance().Get(op_type_).exprs[exprs_index]; auto num_operands = OperationMap::Instance().Get(op_type_).num_operands; @@ -136,16 +121,22 @@ std::string OperationExpression::GetRHS(std::unordered_set* used, size_t input_size = input_ids_.size(); rhs = ExpandMultivariateTemplate(rhs, input_size); } - for (size_t i = 0; i < rhs.size(); i++) { - size_t pos = i; + + size_t pos = 0; + while (pos < rhs.size()) { if (rhs[pos] == '$' && rhs[pos + 1] == '{') { - int length = 0; - while (rhs[pos + 2 + length] != '}') { - length++; + size_t length = 0; + int bracket_number = 1; + for (length = 0; (pos + 2 + length) < rhs.size(); length++) { + char ch = rhs[pos + 2 + length]; + if (ch == '}') bracket_number--; + if (ch == '{') bracket_number++; + if (bracket_number == 0) break; } std::string index_str = rhs.substr(pos + 2, length); std::string refine_str = RefineTemplateWithAttr(op_type_, index_str, attr_); + std::string var_name; if (index_str == refine_str) { int index = StringTo(index_str); PADDLE_ENFORCE_LT(index, input_ids_.size(), @@ -160,20 +151,31 @@ std::string OperationExpression::GetRHS(std::unordered_set* used, index, op_type_, input_ids_[index])); // TODO(wangchaochaohu): Here fp16 convert to float to do comupte, we // need to add general fp16 compute later. - std::string var_name; - if (rhs_type_ == "float16") { - half2fp32_statement->append(OptimzeFP16RHS(used, index, input_ids_)); - var_name = "half2fp32_" + TmpName(input_ids_[index]); - } else { - var_name = TmpName(input_ids_[index]); - } + var_name = TmpName(input_ids_[index]); rhs.replace(pos, length + 3, var_name); used->insert(input_ids_[index]); } else { - std::string var_name = refine_str; + var_name = refine_str; rhs.replace(pos, length + 3, var_name); } + pos = pos + var_name.length(); + } + pos++; + } + pos = 0; + while (pos < rhs.size()) { + if (rhs[pos] == '%' && rhs[pos + 1] == '{') { + int length = 0; + while (rhs[pos + 2 + length] != '}') { + length++; + } + std::string number_str = rhs.substr(pos + 2, length); + if (rhs_type_ == "__half") + number_str = "__float2half(" + number_str + ")"; + rhs.replace(pos, length + 3, number_str); + pos = pos + number_str.length(); } + pos++; } return rhs; } @@ -192,28 +194,24 @@ bool OperationExpression::IsSupport() const { // unique for the node which belong the group std::string OperationExpression::GetExpression( std::unordered_set* used) const { - std::string half2fp32_statement; std::stringstream ret; if (IsSupport()) { for (size_t i = 0; i < output_ids_.size(); ++i) { std::string cast_str = ""; - if ((lhs_type_ == rhs_type_ && rhs_type_ != "float16") || - (lhs_type_ != rhs_type_ && rhs_type_ == "float16")) { - ret << GetLHS(i) << " = " << GetRHS(used, &half2fp32_statement, i) - << ";"; + if (lhs_type_ == rhs_type_) { + ret << GetLHS(i) << " = " << GetRHS(used, i) << ";"; } else { - if ((lhs_type_ == rhs_type_ && rhs_type_ == "float16") || - lhs_type_ == "float16") { + if (lhs_type_ == "__half") cast_str = "__float2half"; - } else { + else if (rhs_type_ == "__half") + cast_str = "__half2float"; + else cast_str = "static_cast<" + lhs_type_ + ">"; - } - ret << GetLHS(i) << " = " << cast_str << "(" - << GetRHS(used, &half2fp32_statement, i) << ");"; + ret << GetLHS(i) << " = " << cast_str << "(" << GetRHS(used, i) << ");"; } } } - return half2fp32_statement + ret.str(); + return ret.str(); } } // namespace fusion_group diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h index 2a35ac6f63..2f4a5f5901 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h @@ -68,7 +68,6 @@ class OperationExpression { private: // TODO(wangchao): make offset more flexible we add stride and basic offset std::string GetRHS(std::unordered_set* used, - std::string* half2fp32_statement, size_t exprs_index = 0) const; std::string GetLHS(size_t i = 0) const; diff --git a/paddle/fluid/framework/ir/fusion_group/cuda_resources.h b/paddle/fluid/framework/ir/fusion_group/cuda_resources.h index fa0bd96ccd..945b7929db 100644 --- a/paddle/fluid/framework/ir/fusion_group/cuda_resources.h +++ b/paddle/fluid/framework/ir/fusion_group/cuda_resources.h @@ -36,11 +36,6 @@ __device__ inline double Sqrt(double x) { return sqrt(x); } )"; static constexpr char predefined_cuda_functions_fp16[] = R"( -__device__ inline float Max(float x, float y) { return fmaxf(x, y); } -__device__ inline float Exp(float x) { return expf(x); } -__device__ inline float Log(float x) { return logf(x); } -__device__ inline float Sqrt(float x) { return sqrtf(x); } - #define __HALF_TO_US(var) *(reinterpret_cast(&(var))) #define __HALF_TO_CUS(var) *(reinterpret_cast(&(var))) @@ -65,6 +60,219 @@ __device__ float __half2float(const __half h) { return val; } +#define __CUDA_FP16_DECL__ __host__ __device__ +/****************************************************************************** +* __half comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF_MACRO(name) do {\ + unsigned short val; \ + asm( "{ .reg .pred __$temp3;\n" \ + " setp."#name".f16 __$temp3, %1, %2;\n" \ + " selp.u16 %0, 1, 0, __$temp3;}" \ + : "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \ + return val ? true : false; \ +} while(0); +__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(eq); +} +__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ne); +} +__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(le); +} +__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ge); +} +__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(lt); +} +__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(gt); +} +__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(equ); +} +__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(neu); +} +__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(leu); +} +__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(geu); +} +__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ltu); +} +__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(gtu); +} +#undef __COMPARISON_OP_HALF_MACRO + +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +#define __BINARY_OP_HALF_MACRO(name) do {\ + __half val; \ + asm( "{"#name".f16 %0,%1,%2;\n}" \ + :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \ + return val; \ +} while(0); +__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add); +} +__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub); +} +__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul); +} +__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add.sat); +} +__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub.sat); +} +__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul.sat); +} +#undef __BINARY_OP_HALF_MACRO +#define __TERNARY_OP_HALF_MACRO(name) do {\ + __half val; \ + asm( "{"#name".f16 %0,%1,%2,%3;\n}" \ + :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \ + return val; \ +} while(0); +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn); +} +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn.sat); +} +#undef __TERNARY_OP_HALF2_MACRO +__CUDA_FP16_DECL__ __half __hdiv(__half a, __half b) { + __half v, abs, den; + __HALF_TO_US(den) = 0x008F; + float fa, fb, fv, rcp; + + fa = __half2float(a); + fb = __half2float(b); + + asm("{rcp.approx.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb)); + + fv = rcp * fa; + + v = __float2half(fv); + __HALF_TO_US(abs) = (unsigned short)(((unsigned int)__HALF_TO_CUS(v)) & 0x00007FFF); + if (__hlt(abs, den) && (!(__HALF_TO_CUS(abs) == 0x0000))) { + float err = __fmaf_rn(-fb, fv, fa); + fv = __fmaf_rn(rcp, err, fv); + v = __float2half(fv); + } + return v; +} + + +/* Some basic arithmetic operations expected of a builtin */ +__device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); } +__device__ __forceinline__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); } +__device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); } +__device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); } + +/* Some basic comparison operations to make it look like a builtin */ +__device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); } +__device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hne(lh, rh); } +__device__ __forceinline__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); } +__device__ __forceinline__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); } +__device__ __forceinline__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); } +__device__ __forceinline__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); } + +#define __SPEC_CASE(i,r, spc, ulp) \ + "{.reg.b16 spc, ulp, p;\n"\ + " mov.b16 spc,"#spc";\n"\ + " mov.b16 ulp,"#ulp";\n"\ + " set.eq.f16.f16 p,"#i", spc;\n"\ + " fma.rn.f16 "#r",p,ulp,"#r";\n}\n" + +__CUDA_FP16_DECL__ __half hexp(const __half a) { + __half val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 h,r; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " mov.b32 C, 0x3fb8aa3b; \n" + " mul.f32 f,f,C; \n" + " ex2.approx.f32 f,f; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X1F79, 0x9400) + __SPEC_CASE(h, r, 0X25CF, 0x9400) + __SPEC_CASE(h, r, 0XC13B, 0x0400) + __SPEC_CASE(h, r, 0XC1EF, 0x0200) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} + +__CUDA_FP16_DECL__ __half hlog(const __half a) { + __half val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 r,h; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " lg2.approx.f32 f,f; \n" + " mov.b32 C, 0x3f317218; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X160D, 0x9C00) + __SPEC_CASE(h, r, 0X3BFE, 0x8010) + __SPEC_CASE(h, r, 0X3C0B, 0x8080) + __SPEC_CASE(h, r, 0X6051, 0x1C00) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} + +#define __APPROX_FCAST(fun) do {\ + __half val;\ + asm("{.reg.b32 f; \n"\ + " .reg.b16 r; \n"\ + " mov.b16 r,%1; \n"\ + " cvt.f32.f16 f,r; \n"\ + " "#fun".approx.f32 f,f; \n"\ + " cvt.rn.f16.f32 r,f; \n"\ + " mov.b16 %0,r; \n"\ + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\ + return val;\ +} while(0); +__CUDA_FP16_DECL__ __half hsqrt(const __half a) { + __APPROX_FCAST(sqrt); +} + +__device__ inline __half Exp(const __half x) { return hexp(x); } +__device__ inline __half Log(const __half x) { return hlog(x); } +__device__ inline __half Sqrt(const __half x) { return hsqrt(x); } + #undef __HALF_TO_US #undef __HALF_TO_CUS @@ -81,7 +289,6 @@ extern "C" __global__ void $func_name($parameters) { } } )"; - } // namespace fusion_group } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/fusion_group/operation.cc b/paddle/fluid/framework/ir/fusion_group/operation.cc index 6c4bb98e1b..ef71538457 100644 --- a/paddle/fluid/framework/ir/fusion_group/operation.cc +++ b/paddle/fluid/framework/ir/fusion_group/operation.cc @@ -91,17 +91,18 @@ void OperationMap::InsertUnaryElementwiseOperations() { // relu: // out = f(x) = x > 0 ? x : 0 // dx = dout * (out > 0 ? 1 : 0) - insert_handler("relu", "${0} > 0 ? ${0} : 0", {"${1} > 0 ? ${2} : 0"}); + insert_handler("relu", "${0} > %{0} ? ${0} : %{0.0}", + {"${1} > %{0.0} ? ${2} : %{0.0}"}); // sigmoid: // out = f(x) = 1.0 / (1.0 + exp(-x)) // dx = dout * out * (1 - out) - insert_handler("sigmoid", "1.0 / (1.0 + Exp(- ${0}))", - {"${2} * ${1} * (1.0 - ${1})"}); + insert_handler("sigmoid", "%{1.0} / (%{1.0} + Exp(- ${0}))", + {"${2} * ${1} * (%{1.0} - ${1})"}); // tanh: // out = f(x) = 2.0 / (1.0 + exp(-2.0 * x)) - 1.0; // dx = dout * (1 - out * out) - insert_handler("tanh", "2.0 / (1.0 + Exp(-2.0 * ${0})) - 1.0", - {"${2} * (1.0 - ${1} * ${1})"}); + insert_handler("tanh", "%{2.0} / (%{1.0} + Exp(-%{2.0} * ${0})) - %{1.0}", + {"${2} * (%{1.0} - ${1} * ${1})"}); // cast: // out = static_cast(x) @@ -112,22 +113,23 @@ void OperationMap::InsertUnaryElementwiseOperations() { // sqrt: // out = x^(1/2) // dx = dout * 0.5 / out - insert_handler("sqrt", "Sqrt(${0})", {"${2} * 0.5 / ${1}"}); + insert_handler("sqrt", "Sqrt(${0})", {"${2} * %{0.5} / ${1}"}); // square: // out = x^2 // dx = dout * 2.0 * x - insert_handler("square", "${0} * ${0}", {"${2} * 2.0 * ${0}"}); + insert_handler("square", "${0} * ${0}", {"${2} * %{2.0} * ${0}"}); // scale // out = (bias_after_scale) ? scale * X + bias : scale(X + bias) // here we use '=' operator to seperate th default value // TODO(wangchaochaohu): Later we need to support Tensor input for scale and // bias. - insert_handler("scale", - "${bias_after_scale=true} ? (${scale=1.0} * ${0} + " - "${bias=0.0}) : (${scale=1.0} * (${0} + ${bias=0.0}))", - {}); + insert_handler( + "scale", + "${bias_after_scale=true} ? (${scale=%{1.0}} * ${0} + " + "${bias=%{0.0}}) : (${scale=%{1.0}} * (${0} + ${bias=%{0.0}}))", + {}); } void OperationMap::InsertBinaryElementwiseOperations() { -- GitLab