From 809a10b67c35716950ec6f6ccbfde24ddc042652 Mon Sep 17 00:00:00 2001 From: Feiyu Chan Date: Thu, 27 Jan 2022 11:17:28 +0800 Subject: [PATCH] move math_cuda_utils.h to pten/kernels/funcs (#39246) --- paddle/fluid/operators/activation_op.cu | 1 - paddle/fluid/operators/interpolate_v2_op.cu | 17 +- .../operators/math/bert_encoder_functor.cu | 173 ++++++++++-------- .../operators/optimizers/lars_momentum_op.cu | 17 +- paddle/fluid/operators/softmax_cudnn_op.cu.h | 1 - .../kernels/funcs}/math_cuda_utils.h | 10 +- 6 files changed, 124 insertions(+), 95 deletions(-) rename paddle/{fluid/operators/math => pten/kernels/funcs}/math_cuda_utils.h (98%) diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index b4a9386ce0..1ee5f35883 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -12,7 +12,6 @@ limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/math/math_cuda_utils.h" #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index 72dd0fc743..a4d5e75e72 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -12,11 +12,11 @@ #include #include #include "paddle/fluid/operators/interpolate_v2_op.h" -#include "paddle/fluid/operators/math/math_cuda_utils.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/fast_divmod.h" +#include "paddle/pten/kernels/funcs/math_cuda_utils.h" namespace paddle { namespace operators { @@ -522,7 +522,7 @@ __inline__ __device__ T PartialBlockMin(T val, size_t threads_num_in_block, if (threadIdx.x < threshold) { shared_last_idx = (threshold >> 5) - 1; - val = math::warpReduceMin(val, mask); + val = pten::funcs::warpReduceMin(val, mask); if (lane == 0) { shared[wid] = val; } @@ -537,7 +537,7 @@ __inline__ __device__ T PartialBlockMin(T val, size_t threads_num_in_block, if (threadIdx.x < threshold) { val = (lane <= shared_last_idx) ? shared[lane] : std::numeric_limits::max(); - val = math::warpReduceMin(val, mask); + val = pten::funcs::warpReduceMin(val, mask); shared_last_val = val; } __syncthreads(); @@ -589,12 +589,15 @@ __global__ void KeBilinearInterpBwShareMemory( s_data[0][threadIdx.x] = 0.f; s_data[1][threadIdx.x] = 0.f; int remain = nthreads - (tid & (-blockDim.x)); - int in_top_max_index = math::blockReduceMax(top_right_index, FINAL_MASK); - int in_bot_max_index = math::blockReduceMax(bot_right_index, FINAL_MASK); + int in_top_max_index = + pten::funcs::blockReduceMax(top_right_index, FINAL_MASK); + int in_bot_max_index = + pten::funcs::blockReduceMax(bot_right_index, FINAL_MASK); if (remain > blockDim.x) { - in_top_min_index = math::blockReduceMin(input_index, FINAL_MASK); - in_bot_min_index = math::blockReduceMin(bot_left_index, FINAL_MASK); + in_top_min_index = pten::funcs::blockReduceMin(input_index, FINAL_MASK); + in_bot_min_index = + pten::funcs::blockReduceMin(bot_left_index, FINAL_MASK); } else { in_top_min_index = PartialBlockMin(input_index, remain, FINAL_MASK); in_bot_min_index = PartialBlockMin(bot_left_index, remain, FINAL_MASK); diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu index 645d1f6371..b9b209646d 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.cu +++ b/paddle/fluid/operators/math/bert_encoder_functor.cu @@ -18,13 +18,17 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_cuda_utils.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/kernels/funcs/math_cuda_utils.h" namespace paddle { namespace operators { namespace math { +// NOTE(chenfeiyu): explicitly use operator+ for float2 +// since float2 is not in namespace pten::funcs, ADL won't help +using pten::funcs::operator+; + template __device__ __forceinline__ T local_rsqrt(T num) { return rsqrt(static_cast(num)); @@ -34,11 +38,12 @@ __device__ __forceinline__ half local_rsqrt(half num) { return hrsqrt(num); } #endif template -__device__ inline void LayerNormSmall(T val, const kvp &thread_data, +__device__ inline void LayerNormSmall(T val, + const pten::funcs::kvp &thread_data, const int ld, const int idx, const float *bias, const float *scale, T *output, T eps) { - using BlockReduce = cub::BlockReduce, TPB>; + using BlockReduce = cub::BlockReduce, TPB>; __shared__ typename BlockReduce::TempStorage temp_storage; __shared__ T mu; // mean __shared__ T rsigma; // 1 / std.dev. @@ -59,10 +64,11 @@ __device__ inline void LayerNormSmall(T val, const kvp &thread_data, } template -__device__ inline void LayerNorm(const kvp &thread_data, const int ld, - const int offset, const float *bias, - const float *scale, T *output, T eps) { - using BlockReduce = cub::BlockReduce, TPB>; +__device__ inline void LayerNorm(const pten::funcs::kvp &thread_data, + const int ld, const int offset, + const float *bias, const float *scale, + T *output, T eps) { + using BlockReduce = cub::BlockReduce, TPB>; __shared__ typename BlockReduce::TempStorage temp_storage; __shared__ T mu; // mean __shared__ T rsigma; // 1 / std.dev. @@ -85,10 +91,11 @@ __device__ inline void LayerNorm(const kvp &thread_data, const int ld, } template -__device__ inline void LayerNorm2(const kvp &thread_data, const int ld, - const int offset, const float2 *bias, - const float2 *scale, T2 *output, T eps) { - using BlockReduce = cub::BlockReduce, TPB>; +__device__ inline void LayerNorm2(const pten::funcs::kvp &thread_data, + const int ld, const int offset, + const float2 *bias, const float2 *scale, + T2 *output, T eps) { + using BlockReduce = cub::BlockReduce, TPB>; __shared__ typename BlockReduce::TempStorage temp_storage; __shared__ T mu; // mean __shared__ T rsigma; // 1 / std.dev. @@ -137,7 +144,7 @@ __global__ void EmbEltwiseLayernormKernel(int hidden, const int64_t *ids, const int64_t out_offset = seq_pos * hidden; - kvp thread_data(0, 0); + pten::funcs::kvp thread_data(0, 0); #pragma unroll for (int it = threadIdx.x; it < hidden; it += TPB) { @@ -148,7 +155,8 @@ __global__ void EmbEltwiseLayernormKernel(int hidden, const int64_t *ids, output[out_offset + it] = val; const T rhiddenval = rhidden * val; - thread_data = pair_sum(thread_data, kvp(rhiddenval, rhiddenval * val)); + thread_data = pair_sum(thread_data, + pten::funcs::kvp(rhiddenval, rhiddenval * val)); } LayerNorm(thread_data, hidden, out_offset, bias, scale, output, eps); } @@ -180,7 +188,7 @@ __global__ void EmbEltwiseLayernormKernel( const int64_t out_offset = seq_pos * hidden; - kvp thread_data(0, 0); + pten::funcs::kvp thread_data(0, 0); #pragma unroll for (int it = threadIdx.x; it < hidden; it += 256) { @@ -191,8 +199,8 @@ __global__ void EmbEltwiseLayernormKernel( output[out_offset + it] = val; const half rhiddenval = rhidden * val; - thread_data = - pair_sum(thread_data, kvp(rhiddenval, rhiddenval * val)); + thread_data = pair_sum( + thread_data, pten::funcs::kvp(rhiddenval, rhiddenval * val)); } LayerNorm(thread_data, hidden, out_offset, bias, scale, output, eps); @@ -233,10 +241,10 @@ __global__ void SoftmaxKernelWithEltadd(T *qk_buf_, const T *bias_qk_, ? static_cast(qk_buf_[threadIdx.x + qk_offset] + bias_qk_[threadIdx.x + qk_offset]) : -1e20f; - float max_val = blockReduceMax(tmp, mask); + float max_val = pten::funcs::blockReduceMax(tmp, mask); float qk_tmp = threadIdx.x < seq_len ? __expf(tmp - max_val) : 0.0f; - float sum_val = blockReduceSum(qk_tmp, mask); + float sum_val = pten::funcs::blockReduceSum(qk_tmp, mask); if (threadIdx.x < seq_len) qk_buf_[threadIdx.x + qk_offset] = (T)(qk_tmp / sum_val); @@ -256,10 +264,10 @@ __global__ void SoftmaxKernelWithEltadd( ? static_cast(qk_buf_[threadIdx.x + qk_offset] + bias_qk_[threadIdx.x + qk_offset]) : -1e20f; - float max_val = blockReduceMax(tmp, mask); + float max_val = pten::funcs::blockReduceMax(tmp, mask); float qk_tmp = threadIdx.x < seq_len ? __expf(tmp - max_val) : 0.0f; - float sum_val = blockReduceSum(qk_tmp, mask); + float sum_val = pten::funcs::blockReduceSum(qk_tmp, mask); if (threadIdx.x < seq_len) qk_buf_[threadIdx.x + qk_offset] = (half)(qk_tmp / sum_val); @@ -276,19 +284,20 @@ __global__ void SoftmaxKernelWithEltadd2(T *qk_buf_, const T *bias_qk_, int idx = threadIdx.x; assert(blockDim.x % 32 == 0); - float2 tmp = - idx < seq_len - ? ToFloat2(qk_buf_[idx + qk_offset] + bias_qk_[idx + qk_offset]) - : make_float2(-1e20f, -1e20f); - float max_val = blockReduceMax(max(tmp.x, tmp.y), mask); + float2 tmp = idx < seq_len + ? pten::funcs::ToFloat2(qk_buf_[idx + qk_offset] + + bias_qk_[idx + qk_offset]) + : make_float2(-1e20f, -1e20f); + float max_val = pten::funcs::blockReduceMax(max(tmp.x, tmp.y), mask); float2 qk_tmp = idx < seq_len ? make_float2(__expf(tmp.x - max_val), __expf(tmp.y - max_val)) : make_float2(0.f, 0.f); - float sum_val = blockReduceSum(qk_tmp.x + qk_tmp.y, mask) + 1e-6f; + float sum_val = + pten::funcs::blockReduceSum(qk_tmp.x + qk_tmp.y, mask) + 1e-6f; if (idx < seq_len) { qk_buf_[idx + qk_offset] = - FloatsToPair(qk_tmp.x / sum_val, qk_tmp.y / sum_val); + pten::funcs::FloatsToPair(qk_tmp.x / sum_val, qk_tmp.y / sum_val); } } @@ -304,18 +313,20 @@ __global__ void SoftmaxKernelWithEltadd2( int idx = threadIdx.x; assert(blockDim.x % 32 == 0); - float2 tmp = idx < seq_len ? ToFloat2(qk_buf_[idx + qk_offset] + - bias_qk_[idx + qk_offset]) - : make_float2(-1e20f, -1e20f); - float max_val = blockReduceMax(max(tmp.x, tmp.y), mask); + float2 tmp = idx < seq_len + ? pten::funcs::ToFloat2(qk_buf_[idx + qk_offset] + + bias_qk_[idx + qk_offset]) + : make_float2(-1e20f, -1e20f); + float max_val = pten::funcs::blockReduceMax(max(tmp.x, tmp.y), mask); float2 qk_tmp = idx < seq_len ? make_float2(__expf(tmp.x - max_val), __expf(tmp.y - max_val)) : make_float2(0.f, 0.f); - float sum_val = blockReduceSum(qk_tmp.x + qk_tmp.y, mask) + 1e-6f; + float sum_val = + pten::funcs::blockReduceSum(qk_tmp.x + qk_tmp.y, mask) + 1e-6f; if (idx < seq_len) { - qk_buf_[idx + qk_offset] = - FloatsToPair(qk_tmp.x / sum_val, qk_tmp.y / sum_val); + qk_buf_[idx + qk_offset] = pten::funcs::FloatsToPair( + qk_tmp.x / sum_val, qk_tmp.y / sum_val); } #endif } @@ -338,14 +349,14 @@ __global__ void SoftmaxKernelWithEltaddForLarge(T *qk_buf, const T *bias_qk, bias_qk[threadIdx.x + i + qk_offset] : stride_max; } - T max_val = blockReduceMax(stride_max, mask); + T max_val = pten::funcs::blockReduceMax(stride_max, mask); T stride_sum = 0.f; for (int i = 0; i < seq_len; i += blockDim.x) { stride_sum += __expf(qk_buf[threadIdx.x + i + qk_offset] + bias_qk[threadIdx.x + i + qk_offset] - max_val); } - T sum_val = blockReduceSum(stride_sum, mask); + T sum_val = pten::funcs::blockReduceSum(stride_sum, mask); for (int i = 0; i < seq_len; i += blockDim.x) { qk_buf[threadIdx.x + i + qk_offset] = @@ -371,7 +382,7 @@ __global__ void SoftmaxKernelWithEltaddForLarge( bias_qk[threadIdx.x + i + qk_offset]); stride_max = tmp > stride_max ? tmp : stride_max; } - float max_val = blockReduceMax(stride_max, mask); + float max_val = pten::funcs::blockReduceMax(stride_max, mask); float stride_sum = 0.f; for (int i = 0; i < seq_len; i += blockDim.x) { @@ -379,7 +390,7 @@ __global__ void SoftmaxKernelWithEltaddForLarge( bias_qk[threadIdx.x + i + qk_offset]); stride_sum += __expf(tmp - max_val); } - float sum_val = blockReduceSum(stride_sum, mask); + float sum_val = pten::funcs::blockReduceSum(stride_sum, mask); for (int i = 0; i < seq_len; i += blockDim.x) { float tmp = @@ -403,28 +414,33 @@ __global__ void SoftmaxKernelWithEltaddForLarge2(T *qk_buf_, const T *bias_qk_, float2 stride_max = make_float2(-1e20f, -1e20f); for (int i = 0; i < seq_len; i += blockDim.x) { - float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + - bias_qk_[threadIdx.x + i + qk_offset]); + float2 cur = + pten::funcs::ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); stride_max.x = max(stride_max.x, cur.x); stride_max.y = max(stride_max.y, cur.y); } - float max_val = blockReduceMax(max(stride_max.x, stride_max.y), mask); + float max_val = + pten::funcs::blockReduceMax(max(stride_max.x, stride_max.y), mask); float2 stride_sum = make_float2(0.f, 0.f); for (int i = 0; i < seq_len; i += blockDim.x) { - float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + - bias_qk_[threadIdx.x + i + qk_offset]); + float2 cur = + pten::funcs::ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); stride_sum.x += __expf(cur.x - max_val); stride_sum.y += __expf(cur.y - max_val); } float sum_val = - blockReduceSum(stride_sum.x + stride_sum.y, mask) + 1e-6f; + pten::funcs::blockReduceSum(stride_sum.x + stride_sum.y, mask) + + 1e-6f; for (int i = 0; i < seq_len; i += blockDim.x) { - float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + - bias_qk_[threadIdx.x + i + qk_offset]); - qk_buf_[threadIdx.x + i + qk_offset] = FloatsToPair( + float2 cur = + pten::funcs::ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + qk_buf_[threadIdx.x + i + qk_offset] = pten::funcs::FloatsToPair( __expf(cur.x - max_val) / sum_val, __expf(cur.y - max_val) / sum_val); } } @@ -443,28 +459,33 @@ __global__ void SoftmaxKernelWithEltaddForLarge2( float2 stride_max = make_float2(-1e20f, -1e20f); for (int i = 0; i < seq_len; i += blockDim.x) { - float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + - bias_qk_[threadIdx.x + i + qk_offset]); + float2 cur = + pten::funcs::ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); stride_max.x = max(stride_max.x, cur.x); stride_max.y = max(stride_max.y, cur.y); } - float max_val = blockReduceMax(max(stride_max.x, stride_max.y), mask); + float max_val = + pten::funcs::blockReduceMax(max(stride_max.x, stride_max.y), mask); float2 stride_sum = make_float2(0.f, 0.f); for (int i = 0; i < seq_len; i += blockDim.x) { - float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + - bias_qk_[threadIdx.x + i + qk_offset]); + float2 cur = + pten::funcs::ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); stride_sum.x += __expf(cur.x - max_val); stride_sum.y += __expf(cur.y - max_val); } float sum_val = - blockReduceSum(stride_sum.x + stride_sum.y, mask) + 1e-6f; + pten::funcs::blockReduceSum(stride_sum.x + stride_sum.y, mask) + + 1e-6f; for (int i = 0; i < seq_len; i += blockDim.x) { - float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + - bias_qk_[threadIdx.x + i + qk_offset]); - qk_buf_[threadIdx.x + i + qk_offset] = FloatsToPair( + float2 cur = + pten::funcs::ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + qk_buf_[threadIdx.x + i + qk_offset] = pten::funcs::FloatsToPair( __expf(cur.x - max_val) / sum_val, __expf(cur.y - max_val) / sum_val); } #endif @@ -595,13 +616,14 @@ __global__ void SkipLayerNormSmallKernel(int num, int hidden, const T *input1, const T rld = T(1) / T(hidden); const int offset = blockIdx.x * hidden; cub::Sum pair_sum; - kvp thread_data(0, 0); + pten::funcs::kvp thread_data(0, 0); const int idx = offset + threadIdx.x; T val = 0; if (threadIdx.x < hidden) { val = input1[idx] + input2[idx]; const T rldval = rld * val; - thread_data = pair_sum(thread_data, kvp(rldval, rldval * val)); + thread_data = + pair_sum(thread_data, pten::funcs::kvp(rldval, rldval * val)); } LayerNormSmall(val, thread_data, hidden, idx, bias, scale, output, eps); @@ -617,13 +639,14 @@ __global__ void SkipLayerNormSmallKernel( const half rld = half(1) / half(hidden); const int offset = blockIdx.x * hidden; cub::Sum pair_sum; - kvp thread_data(0, 0); + pten::funcs::kvp thread_data(0, 0); const int idx = offset + threadIdx.x; half val = 0; if (threadIdx.x < hidden) { val = input1[idx] + input2[idx]; const half rldval = rld * val; - thread_data = pair_sum(thread_data, kvp(rldval, rldval * val)); + thread_data = + pair_sum(thread_data, pten::funcs::kvp(rldval, rldval * val)); } LayerNormSmall(val, thread_data, hidden, idx, bias, scale, output, eps); @@ -638,13 +661,14 @@ __global__ void SkipLayerNormSmallKernel( const half rld = half(1) / half(hidden); const int offset = blockIdx.x * hidden; cub::Sum pair_sum; - kvp thread_data(0, 0); + pten::funcs::kvp thread_data(0, 0); const int idx = offset + threadIdx.x; half val = 0; if (threadIdx.x < hidden) { val = input1[idx] + input2[idx]; const half rldval = rld * val; - thread_data = pair_sum(thread_data, kvp(rldval, rldval * val)); + thread_data = + pair_sum(thread_data, pten::funcs::kvp(rldval, rldval * val)); } LayerNormSmall(val, thread_data, hidden, idx, bias, scale, output, eps); @@ -659,13 +683,14 @@ __global__ void SkipLayerNormSmallKernel( const half rld = half(1) / half(hidden); const int offset = blockIdx.x * hidden; cub::Sum pair_sum; - kvp thread_data(0, 0); + pten::funcs::kvp thread_data(0, 0); const int idx = offset + threadIdx.x; half val = 0; if (threadIdx.x < hidden) { val = input1[idx] + input2[idx]; const half rldval = rld * val; - thread_data = pair_sum(thread_data, kvp(rldval, rldval * val)); + thread_data = + pair_sum(thread_data, pten::funcs::kvp(rldval, rldval * val)); } LayerNormSmall(val, thread_data, hidden, idx, bias, scale, output, eps); @@ -681,13 +706,14 @@ __global__ void SkipLayerNormKernel(int num, int hidden, const T *input1, const T rld = T(1) / T(hidden); const int offset = blockIdx.x * hidden; cub::Sum pair_sum; - kvp thread_data(0, 0); + pten::funcs::kvp thread_data(0, 0); for (int it = threadIdx.x; it < hidden; it += TPB) { const int idx = offset + it; const T val = input1[idx] + input2[idx]; const T rldval = rld * val; - thread_data = pair_sum(thread_data, kvp(rldval, rldval * val)); + thread_data = + pair_sum(thread_data, pten::funcs::kvp(rldval, rldval * val)); output[idx] = val; } LayerNorm(thread_data, hidden, offset, bias, scale, output, eps); @@ -705,13 +731,14 @@ __global__ void SkipLayerNormKernel(int num, int hidden, const half rld = half(1) / half(hidden); const int offset = blockIdx.x * hidden; cub::Sum pair_sum; - kvp thread_data(0, 0); + pten::funcs::kvp thread_data(0, 0); for (int it = threadIdx.x; it < hidden; it += 256) { const int idx = offset + it; const half val = input1[idx] + input2[idx]; const half rldval = rld * val; - thread_data = pair_sum(thread_data, kvp(rldval, rldval * val)); + thread_data = + pair_sum(thread_data, pten::funcs::kvp(rldval, rldval * val)); output[idx] = val; } LayerNorm(thread_data, hidden, offset, bias, scale, output, eps); @@ -727,13 +754,14 @@ __global__ void SkipLayerNormKernel2(int num, int hidden, const T2 *input1, const T rld = T(0.5f / hidden); // because hidden is hidden/2 const int offset = blockIdx.x * hidden; cub::Sum pair_sum; - kvp thread_data(0, 0); + pten::funcs::kvp thread_data(0, 0); for (int it = threadIdx.x; it < hidden; it += TPB) { const int idx = offset + it; const T2 val2 = input1[idx] + input2[idx]; thread_data = pair_sum( - thread_data, kvp(rld * (val2.x + val2.y), + thread_data, + pten::funcs::kvp(rld * (val2.x + val2.y), rld * val2.x * val2.x + rld * val2.y * val2.y)); output[idx] = val2; } @@ -751,13 +779,14 @@ __global__ void SkipLayerNormKernel2( const half rld = half(0.5f / hidden); // because hidden is hidden/2 const int offset = blockIdx.x * hidden; cub::Sum pair_sum; - kvp thread_data(0, 0); + pten::funcs::kvp thread_data(0, 0); for (int it = threadIdx.x; it < hidden; it += 256) { const int idx = offset + it; const half2 val2 = input1[idx] + input2[idx]; thread_data = pair_sum( - thread_data, kvp(rld * (val2.x + val2.y), + thread_data, + pten::funcs::kvp(rld * (val2.x + val2.y), rld * val2.x * val2.x + rld * val2.y * val2.y)); output[idx] = val2; } diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index 2c27a2135c..78a00b8533 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/math/math_cuda_utils.h" #include "paddle/fluid/operators/optimizers/lars_momentum_op.h" #include "paddle/fluid/platform/fast_divmod.h" +#include "paddle/pten/kernels/funcs/math_cuda_utils.h" #if CUDA_VERSION >= 11000 #include @@ -170,8 +170,8 @@ __global__ void L2NormKernel( g_tmp += (tmp1 * tmp1); tid += grid_stride; } - p_tmp = math::blockReduceSum(p_tmp, FINAL_MASK); - g_tmp = math::blockReduceSum(g_tmp, FINAL_MASK); + p_tmp = pten::funcs::blockReduceSum(p_tmp, FINAL_MASK); + g_tmp = pten::funcs::blockReduceSum(g_tmp, FINAL_MASK); if (threadIdx.x == 0) { p_buffer[blockIdx.x] = p_tmp; @@ -181,8 +181,8 @@ __global__ void L2NormKernel( cg->sync(); // Grid sync for writring partial result to gloabl memory MT p_part_sum = threadIdx.x < gridDim.x ? p_buffer[threadIdx.x] : 0; MT g_part_sum = threadIdx.x < gridDim.x ? g_buffer[threadIdx.x] : 0; - MT tmp0 = math::blockReduceSum(p_part_sum, FINAL_MASK); - MT tmp1 = math::blockReduceSum(g_part_sum, FINAL_MASK); + MT tmp0 = pten::funcs::blockReduceSum(p_part_sum, FINAL_MASK); + MT tmp1 = pten::funcs::blockReduceSum(g_part_sum, FINAL_MASK); if (threadIdx.x == 0) { s_buffer[0] = tmp0; s_buffer[1] = tmp1; @@ -294,9 +294,10 @@ __global__ void MomentumLarsKernel( MT param_part_norm = threadIdx.x < thresh ? p_buffer[threadIdx.x] : 0; MT grad_part_norm = threadIdx.x < thresh ? g_buffer[threadIdx.x] : 0; __syncthreads(); - MT param_norm = Sqrt(math::blockReduceSum(param_part_norm, FINAL_MASK)); - MT grad_norm = Sqrt(rescale_grad_pow * - math::blockReduceSum(grad_part_norm, FINAL_MASK)); + MT param_norm = + Sqrt(pten::funcs::blockReduceSum(param_part_norm, FINAL_MASK)); + MT grad_norm = Sqrt(rescale_grad_pow * pten::funcs::blockReduceSum( + grad_part_norm, FINAL_MASK)); #endif MomentumUpdate(param, grad, velocity, param_out, velocity_out, master_param, master_param_out, learning_rate, mu, diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.h b/paddle/fluid/operators/softmax_cudnn_op.cu.h index 0c10152c23..236ea448f3 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.h +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.h @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" -#include "paddle/fluid/operators/math/math_cuda_utils.h" #include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" diff --git a/paddle/fluid/operators/math/math_cuda_utils.h b/paddle/pten/kernels/funcs/math_cuda_utils.h similarity index 98% rename from paddle/fluid/operators/math/math_cuda_utils.h rename to paddle/pten/kernels/funcs/math_cuda_utils.h index 8de4e8221c..5d2a49c6b6 100644 --- a/paddle/fluid/operators/math/math_cuda_utils.h +++ b/paddle/pten/kernels/funcs/math_cuda_utils.h @@ -23,9 +23,8 @@ limitations under the License. */ #include -namespace paddle { -namespace operators { -namespace math { +namespace pten { +namespace funcs { template __device__ __forceinline__ T FromFloat(float a); @@ -315,6 +314,5 @@ __inline__ __device__ T PartialBlockReduceMin(T val, unsigned mask) { return val; } -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace pten -- GitLab