Created by: wangchaochaohu
之前处理 fp16 代码的方式比较复杂，同时转换成 fp32 会存在性能问题。The fp16 code generated by this PR:
// Fused elementwise kernel: converts arg3 to half precision, gates the result
// on arg1 being positive, and stores the intermediate and final values.
//
// For every idx in [0, N):
//   arg4[idx] = half(arg3[idx])
//   arg5[idx] = arg6[idx] = arg7[idx] = (arg1[idx] > 0) ? arg4[idx] : 0
//   arg8[idx] = float(arg5[idx])
//
// Parameters:
//   N           number of elements to process; N <= 0 performs no work.
//   arg1, arg3  inputs, only read by this kernel.
//   arg4..arg8  outputs, written once per element.
//   arg0, arg2  present in the signature but never accessed here.
//
// Only the x dimension of the launch configuration is used for indexing, and
// each thread strides over the data by the total number of launched threads,
// so any 1-D grid/block size covers all N elements.
//
// NOTE(review): the `>` comparison on __half relies on the operators provided
// by cuda_fp16.h; confirm the target architecture supports them.
extern "C" __global__ void FusedElementwise15(int N, __half* arg0, __half* arg1, __half* arg2, float* arg3, __half* arg4, __half* arg5, __half* arg6, __half* arg7, float* arg8) {
  // Loop-invariant half-precision zero, built from a float literal.
  const __half zero = __float2half(0.0f);

  // 64-bit index arithmetic: blockIdx.x * blockDim.x and gridDim.x * blockDim.x
  // are 32-bit unsigned products, and adding the stride to a 32-bit int index
  // can wrap to a negative value that would still satisfy `idx < N`.
  const long long count = N;
  const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;

  for (long long idx = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
       idx < count;
       idx += stride) {
    const __half converted = __float2half(arg3[idx]);
    // Keep the converted value only where arg1 is strictly positive.
    const __half gated = arg1[idx] > zero ? converted : zero;

    arg4[idx] = converted;
    arg5[idx] = gated;
    arg6[idx] = gated;
    arg7[idx] = gated;
    arg8[idx] = __half2float(gated);
  }
}
On ResNet50, we compared this PR against the develop branch with https://github.com/PaddlePaddle/Paddle/pull/23317 applied; performance has improved.
Measured with the profiler tool: with this PR, the Fusion Group Op GPU time is 163.071208; on develop with the PR mentioned above, the Fusion Group Op GPU time is 176.647332.