diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 272cf3573fb2f27c62bce86f0e97b8e567b245ae..94ef22506a231911a325ae296c721b904dd337e9 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -67,22 +67,6 @@ __global__ void SumSelectedRowsCUDAKernel(T **sr_in_out, int64_t N, } } -template -__global__ void SumAlign4CUDAKernel(const T *in_0, const T *in_1, T *out, - int64_t N) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = id; i < N / 4; i += blockDim.x * gridDim.x) { - const float4 *in0_4 = reinterpret_cast(in_0); - const float4 *in1_4 = reinterpret_cast(in_1); - float4 tmp; - tmp.x = in0_4[i].x + in1_4[i].x; - tmp.y = in0_4[i].y + in1_4[i].y; - tmp.z = in0_4[i].z + in1_4[i].z; - tmp.w = in0_4[i].w + in1_4[i].w; - reinterpret_cast(out)[i] = tmp; - } -} - template void SumToLoDTensor(const framework::ExecutionContext &context) { auto in_vars = context.MultiInputVar("X");