Unverified commit 809a10b6, authored by Feiyu Chan, committed by GitHub

move math_cuda_utils.h to pten/kernels/funcs (#39246)

Parent 3e6950d5
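This commit relocates the CUDA math utilities header and renames its namespace, so every call site changes both the include path (paddle/fluid/operators/math/math_cuda_utils.h → paddle/pten/kernels/funcs/math_cuda_utils.h) and the qualifier (math:: or unqualified → pten::funcs::). A minimal call-site sketch of the new usage, assuming a Paddle source tree at this commit; kFullWarpMask and BlockSumDemo are illustrative names, not part of the diff:

#include "paddle/pten/kernels/funcs/math_cuda_utils.h"  // was: paddle/fluid/operators/math/math_cuda_utils.h

constexpr unsigned kFullWarpMask = 0xffffffffu;  // plays the role of FINAL_MASK in the hunks below

// Sums n values within one thread block using the relocated block-wide reduction.
template <typename T>
__global__ void BlockSumDemo(const T *in, T *out, int n) {
  T local = T(0);
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    local += in[i];
  }
  // was: paddle::operators::math::blockReduceSum<T>(local, kFullWarpMask)
  T total = pten::funcs::blockReduceSum<T>(local, kFullWarpMask);
  if (threadIdx.x == 0) {
    out[blockIdx.x] = total;
  }
}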
@@ -12,7 +12,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
-#include "paddle/fluid/operators/math/math_cuda_utils.h"
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
...
@@ -12,11 +12,11 @@
 #include <algorithm>
 #include <string>
 #include "paddle/fluid/operators/interpolate_v2_op.h"
-#include "paddle/fluid/operators/math/math_cuda_utils.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/fast_divmod.h"
+#include "paddle/pten/kernels/funcs/math_cuda_utils.h"

 namespace paddle {
 namespace operators {
@@ -522,7 +522,7 @@ __inline__ __device__ T PartialBlockMin(T val, size_t threads_num_in_block,
   if (threadIdx.x < threshold) {
     shared_last_idx = (threshold >> 5) - 1;
-    val = math::warpReduceMin(val, mask);
+    val = pten::funcs::warpReduceMin(val, mask);
     if (lane == 0) {
       shared[wid] = val;
     }
@@ -537,7 +537,7 @@ __inline__ __device__ T PartialBlockMin(T val, size_t threads_num_in_block,
   if (threadIdx.x < threshold) {
     val = (lane <= shared_last_idx) ? shared[lane]
                                     : std::numeric_limits<T>::max();
-    val = math::warpReduceMin(val, mask);
+    val = pten::funcs::warpReduceMin(val, mask);
     shared_last_val = val;
   }
   __syncthreads();
@@ -589,12 +589,15 @@ __global__ void KeBilinearInterpBwShareMemory(
     s_data[0][threadIdx.x] = 0.f;
     s_data[1][threadIdx.x] = 0.f;
     int remain = nthreads - (tid & (-blockDim.x));
-    int in_top_max_index = math::blockReduceMax(top_right_index, FINAL_MASK);
-    int in_bot_max_index = math::blockReduceMax(bot_right_index, FINAL_MASK);
+    int in_top_max_index =
+        pten::funcs::blockReduceMax(top_right_index, FINAL_MASK);
+    int in_bot_max_index =
+        pten::funcs::blockReduceMax(bot_right_index, FINAL_MASK);
     if (remain > blockDim.x) {
-      in_top_min_index = math::blockReduceMin(input_index, FINAL_MASK);
-      in_bot_min_index = math::blockReduceMin(bot_left_index, FINAL_MASK);
+      in_top_min_index = pten::funcs::blockReduceMin(input_index, FINAL_MASK);
+      in_bot_min_index =
+          pten::funcs::blockReduceMin(bot_left_index, FINAL_MASK);
     } else {
       in_top_min_index = PartialBlockMin(input_index, remain, FINAL_MASK);
       in_bot_min_index = PartialBlockMin(bot_left_index, remain, FINAL_MASK);
...
@@ -18,13 +18,17 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/math/bert_encoder_functor.h"
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_cuda_utils.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/kernels/funcs/math_cuda_utils.h"

 namespace paddle {
 namespace operators {
 namespace math {

+// NOTE(chenfeiyu): explicitly use operator+ for float2
+// since float2 is not in namespace pten::funcs, ADL won't help
+using pten::funcs::operator+;
+
 template <typename T>
 __device__ __forceinline__ T local_rsqrt(T num) {
   return rsqrt(static_cast<float>(num));
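As an aside on the NOTE(chenfeiyu) lines added above: argument-dependent lookup only searches the namespaces associated with an operator's argument types, and float2 is a CUDA builtin in the global namespace, so an operator+ defined inside pten::funcs is not visible from paddle::operators::math without an explicit using-declaration. A self-contained sketch of that situation; the demo_* names are illustrative and not part of the diff:

namespace demo_funcs {  // stands in for pten::funcs
__device__ __forceinline__ float2 operator+(const float2 &a, const float2 &b) {
  return make_float2(a.x + b.x, a.y + b.y);
}
}  // namespace demo_funcs

namespace demo_caller {  // stands in for paddle::operators::math
using demo_funcs::operator+;  // without this line, a + b below does not compile

__device__ float2 AddPair(float2 a, float2 b) { return a + b; }
}  // namespace demo_caller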
@@ -34,11 +38,12 @@ __device__ __forceinline__ half local_rsqrt(half num) { return hrsqrt(num); }
 #endif

 template <typename T, int TPB>
-__device__ inline void LayerNormSmall(T val, const kvp<T> &thread_data,
+__device__ inline void LayerNormSmall(T val,
+                                      const pten::funcs::kvp<T> &thread_data,
                                       const int ld, const int idx,
                                       const float *bias, const float *scale,
                                       T *output, T eps) {
-  using BlockReduce = cub::BlockReduce<kvp<T>, TPB>;
+  using BlockReduce = cub::BlockReduce<pten::funcs::kvp<T>, TPB>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   __shared__ T mu;      // mean
   __shared__ T rsigma;  // 1 / std.dev.
@@ -59,10 +64,11 @@ __device__ inline void LayerNormSmall(T val, const kvp<T> &thread_data,
 }

 template <typename T, int TPB>
-__device__ inline void LayerNorm(const kvp<T> &thread_data, const int ld,
-                                 const int offset, const float *bias,
-                                 const float *scale, T *output, T eps) {
-  using BlockReduce = cub::BlockReduce<kvp<T>, TPB>;
+__device__ inline void LayerNorm(const pten::funcs::kvp<T> &thread_data,
+                                 const int ld, const int offset,
+                                 const float *bias, const float *scale,
+                                 T *output, T eps) {
+  using BlockReduce = cub::BlockReduce<pten::funcs::kvp<T>, TPB>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   __shared__ T mu;      // mean
   __shared__ T rsigma;  // 1 / std.dev.
@@ -85,10 +91,11 @@ __device__ inline void LayerNorm(const kvp<T> &thread_data, const int ld,
 }

 template <typename T, typename T2, int TPB>
-__device__ inline void LayerNorm2(const kvp<T> &thread_data, const int ld,
-                                  const int offset, const float2 *bias,
-                                  const float2 *scale, T2 *output, T eps) {
-  using BlockReduce = cub::BlockReduce<kvp<T>, TPB>;
+__device__ inline void LayerNorm2(const pten::funcs::kvp<T> &thread_data,
+                                  const int ld, const int offset,
+                                  const float2 *bias, const float2 *scale,
+                                  T2 *output, T eps) {
+  using BlockReduce = cub::BlockReduce<pten::funcs::kvp<T>, TPB>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   __shared__ T mu;      // mean
   __shared__ T rsigma;  // 1 / std.dev.
@@ -137,7 +144,7 @@ __global__ void EmbEltwiseLayernormKernel(int hidden, const int64_t *ids,
   const int64_t out_offset = seq_pos * hidden;

-  kvp<T> thread_data(0, 0);
+  pten::funcs::kvp<T> thread_data(0, 0);

 #pragma unroll
   for (int it = threadIdx.x; it < hidden; it += TPB) {
@@ -148,7 +155,8 @@ __global__ void EmbEltwiseLayernormKernel(int hidden, const int64_t *ids,
     output[out_offset + it] = val;
     const T rhiddenval = rhidden * val;
-    thread_data = pair_sum(thread_data, kvp<T>(rhiddenval, rhiddenval * val));
+    thread_data = pair_sum(thread_data,
+                           pten::funcs::kvp<T>(rhiddenval, rhiddenval * val));
   }
   LayerNorm<T, TPB>(thread_data, hidden, out_offset, bias, scale, output, eps);
 }
@@ -180,7 +188,7 @@ __global__ void EmbEltwiseLayernormKernel<half, 256>(
   const int64_t out_offset = seq_pos * hidden;

-  kvp<half> thread_data(0, 0);
+  pten::funcs::kvp<half> thread_data(0, 0);

 #pragma unroll
   for (int it = threadIdx.x; it < hidden; it += 256) {
@@ -191,8 +199,8 @@ __global__ void EmbEltwiseLayernormKernel<half, 256>(
     output[out_offset + it] = val;
     const half rhiddenval = rhidden * val;
-    thread_data =
-        pair_sum(thread_data, kvp<half>(rhiddenval, rhiddenval * val));
+    thread_data = pair_sum(
+        thread_data, pten::funcs::kvp<half>(rhiddenval, rhiddenval * val));
   }
   LayerNorm<half, 256>(thread_data, hidden, out_offset, bias, scale, output,
                        eps);
@@ -233,10 +241,10 @@ __global__ void SoftmaxKernelWithEltadd(T *qk_buf_, const T *bias_qk_,
                   ? static_cast<float>(qk_buf_[threadIdx.x + qk_offset] +
                                        bias_qk_[threadIdx.x + qk_offset])
                   : -1e20f;
-  float max_val = blockReduceMax<float>(tmp, mask);
+  float max_val = pten::funcs::blockReduceMax<float>(tmp, mask);

   float qk_tmp = threadIdx.x < seq_len ? __expf(tmp - max_val) : 0.0f;
-  float sum_val = blockReduceSum<float>(qk_tmp, mask);
+  float sum_val = pten::funcs::blockReduceSum<float>(qk_tmp, mask);

   if (threadIdx.x < seq_len)
     qk_buf_[threadIdx.x + qk_offset] = (T)(qk_tmp / sum_val);
@@ -256,10 +264,10 @@ __global__ void SoftmaxKernelWithEltadd<half>(
                   ? static_cast<float>(qk_buf_[threadIdx.x + qk_offset] +
                                        bias_qk_[threadIdx.x + qk_offset])
                   : -1e20f;
-  float max_val = blockReduceMax<float>(tmp, mask);
+  float max_val = pten::funcs::blockReduceMax<float>(tmp, mask);

   float qk_tmp = threadIdx.x < seq_len ? __expf(tmp - max_val) : 0.0f;
-  float sum_val = blockReduceSum<float>(qk_tmp, mask);
+  float sum_val = pten::funcs::blockReduceSum<float>(qk_tmp, mask);

   if (threadIdx.x < seq_len)
     qk_buf_[threadIdx.x + qk_offset] = (half)(qk_tmp / sum_val);
@@ -276,19 +284,20 @@ __global__ void SoftmaxKernelWithEltadd2(T *qk_buf_, const T *bias_qk_,
   int idx = threadIdx.x;
   assert(blockDim.x % 32 == 0);

-  float2 tmp =
-      idx < seq_len
-          ? ToFloat2<T>(qk_buf_[idx + qk_offset] + bias_qk_[idx + qk_offset])
-          : make_float2(-1e20f, -1e20f);
-  float max_val = blockReduceMax<float>(max(tmp.x, tmp.y), mask);
+  float2 tmp = idx < seq_len
+                   ? pten::funcs::ToFloat2<T>(qk_buf_[idx + qk_offset] +
+                                              bias_qk_[idx + qk_offset])
+                   : make_float2(-1e20f, -1e20f);
+  float max_val = pten::funcs::blockReduceMax<float>(max(tmp.x, tmp.y), mask);
   float2 qk_tmp = idx < seq_len ? make_float2(__expf(tmp.x - max_val),
                                               __expf(tmp.y - max_val))
                                 : make_float2(0.f, 0.f);
-  float sum_val = blockReduceSum<float>(qk_tmp.x + qk_tmp.y, mask) + 1e-6f;
+  float sum_val =
+      pten::funcs::blockReduceSum<float>(qk_tmp.x + qk_tmp.y, mask) + 1e-6f;

   if (idx < seq_len) {
     qk_buf_[idx + qk_offset] =
-        FloatsToPair<T>(qk_tmp.x / sum_val, qk_tmp.y / sum_val);
+        pten::funcs::FloatsToPair<T>(qk_tmp.x / sum_val, qk_tmp.y / sum_val);
   }
 }
@@ -304,18 +313,20 @@ __global__ void SoftmaxKernelWithEltadd2<half2>(
   int idx = threadIdx.x;
   assert(blockDim.x % 32 == 0);

-  float2 tmp = idx < seq_len ? ToFloat2<half2>(qk_buf_[idx + qk_offset] +
-                                               bias_qk_[idx + qk_offset])
-                             : make_float2(-1e20f, -1e20f);
-  float max_val = blockReduceMax<float>(max(tmp.x, tmp.y), mask);
+  float2 tmp = idx < seq_len
+                   ? pten::funcs::ToFloat2<half2>(qk_buf_[idx + qk_offset] +
+                                                  bias_qk_[idx + qk_offset])
+                   : make_float2(-1e20f, -1e20f);
+  float max_val = pten::funcs::blockReduceMax<float>(max(tmp.x, tmp.y), mask);
   float2 qk_tmp = idx < seq_len ? make_float2(__expf(tmp.x - max_val),
                                               __expf(tmp.y - max_val))
                                 : make_float2(0.f, 0.f);
-  float sum_val = blockReduceSum<float>(qk_tmp.x + qk_tmp.y, mask) + 1e-6f;
+  float sum_val =
+      pten::funcs::blockReduceSum<float>(qk_tmp.x + qk_tmp.y, mask) + 1e-6f;

   if (idx < seq_len) {
-    qk_buf_[idx + qk_offset] =
-        FloatsToPair<half2>(qk_tmp.x / sum_val, qk_tmp.y / sum_val);
+    qk_buf_[idx + qk_offset] = pten::funcs::FloatsToPair<half2>(
+        qk_tmp.x / sum_val, qk_tmp.y / sum_val);
   }
 #endif
 }
@@ -338,14 +349,14 @@ __global__ void SoftmaxKernelWithEltaddForLarge(T *qk_buf, const T *bias_qk,
                            bias_qk[threadIdx.x + i + qk_offset]
                      : stride_max;
   }
-  T max_val = blockReduceMax<T>(stride_max, mask);
+  T max_val = pten::funcs::blockReduceMax<T>(stride_max, mask);

   T stride_sum = 0.f;
   for (int i = 0; i < seq_len; i += blockDim.x) {
     stride_sum += __expf(qk_buf[threadIdx.x + i + qk_offset] +
                          bias_qk[threadIdx.x + i + qk_offset] - max_val);
   }
-  T sum_val = blockReduceSum<T>(stride_sum, mask);
+  T sum_val = pten::funcs::blockReduceSum<T>(stride_sum, mask);

   for (int i = 0; i < seq_len; i += blockDim.x) {
     qk_buf[threadIdx.x + i + qk_offset] =
@@ -371,7 +382,7 @@ __global__ void SoftmaxKernelWithEltaddForLarge(
                           bias_qk[threadIdx.x + i + qk_offset]);
     stride_max = tmp > stride_max ? tmp : stride_max;
   }
-  float max_val = blockReduceMax<float>(stride_max, mask);
+  float max_val = pten::funcs::blockReduceMax<float>(stride_max, mask);

   float stride_sum = 0.f;
   for (int i = 0; i < seq_len; i += blockDim.x) {
@@ -379,7 +390,7 @@ __global__ void SoftmaxKernelWithEltaddForLarge(
                           bias_qk[threadIdx.x + i + qk_offset]);
     stride_sum += __expf(tmp - max_val);
   }
-  float sum_val = blockReduceSum<float>(stride_sum, mask);
+  float sum_val = pten::funcs::blockReduceSum<float>(stride_sum, mask);

   for (int i = 0; i < seq_len; i += blockDim.x) {
     float tmp =
@@ -403,28 +414,33 @@ __global__ void SoftmaxKernelWithEltaddForLarge2(T *qk_buf_, const T *bias_qk_,
   float2 stride_max = make_float2(-1e20f, -1e20f);
   for (int i = 0; i < seq_len; i += blockDim.x) {
-    float2 cur = ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
-                             bias_qk_[threadIdx.x + i + qk_offset]);
+    float2 cur =
+        pten::funcs::ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
+                                 bias_qk_[threadIdx.x + i + qk_offset]);
     stride_max.x = max(stride_max.x, cur.x);
     stride_max.y = max(stride_max.y, cur.y);
   }
-  float max_val = blockReduceMax<float>(max(stride_max.x, stride_max.y), mask);
+  float max_val =
+      pten::funcs::blockReduceMax<float>(max(stride_max.x, stride_max.y), mask);

   float2 stride_sum = make_float2(0.f, 0.f);
   for (int i = 0; i < seq_len; i += blockDim.x) {
-    float2 cur = ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
-                             bias_qk_[threadIdx.x + i + qk_offset]);
+    float2 cur =
+        pten::funcs::ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
+                                 bias_qk_[threadIdx.x + i + qk_offset]);
     stride_sum.x += __expf(cur.x - max_val);
     stride_sum.y += __expf(cur.y - max_val);
   }
   float sum_val =
-      blockReduceSum<float>(stride_sum.x + stride_sum.y, mask) + 1e-6f;
+      pten::funcs::blockReduceSum<float>(stride_sum.x + stride_sum.y, mask) +
+      1e-6f;

   for (int i = 0; i < seq_len; i += blockDim.x) {
-    float2 cur = ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
-                             bias_qk_[threadIdx.x + i + qk_offset]);
-    qk_buf_[threadIdx.x + i + qk_offset] = FloatsToPair<T>(
+    float2 cur =
+        pten::funcs::ToFloat2<T>(qk_buf_[threadIdx.x + i + qk_offset] +
+                                 bias_qk_[threadIdx.x + i + qk_offset]);
+    qk_buf_[threadIdx.x + i + qk_offset] = pten::funcs::FloatsToPair<T>(
         __expf(cur.x - max_val) / sum_val, __expf(cur.y - max_val) / sum_val);
   }
 }
@@ -443,28 +459,33 @@ __global__ void SoftmaxKernelWithEltaddForLarge2(
   float2 stride_max = make_float2(-1e20f, -1e20f);
   for (int i = 0; i < seq_len; i += blockDim.x) {
-    float2 cur = ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
-                                 bias_qk_[threadIdx.x + i + qk_offset]);
+    float2 cur =
+        pten::funcs::ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
+                                     bias_qk_[threadIdx.x + i + qk_offset]);
     stride_max.x = max(stride_max.x, cur.x);
     stride_max.y = max(stride_max.y, cur.y);
   }
-  float max_val = blockReduceMax<float>(max(stride_max.x, stride_max.y), mask);
+  float max_val =
+      pten::funcs::blockReduceMax<float>(max(stride_max.x, stride_max.y), mask);

   float2 stride_sum = make_float2(0.f, 0.f);
   for (int i = 0; i < seq_len; i += blockDim.x) {
-    float2 cur = ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
-                                 bias_qk_[threadIdx.x + i + qk_offset]);
+    float2 cur =
+        pten::funcs::ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
+                                     bias_qk_[threadIdx.x + i + qk_offset]);
     stride_sum.x += __expf(cur.x - max_val);
     stride_sum.y += __expf(cur.y - max_val);
   }
   float sum_val =
-      blockReduceSum<float>(stride_sum.x + stride_sum.y, mask) + 1e-6f;
+      pten::funcs::blockReduceSum<float>(stride_sum.x + stride_sum.y, mask) +
+      1e-6f;

   for (int i = 0; i < seq_len; i += blockDim.x) {
-    float2 cur = ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
-                                 bias_qk_[threadIdx.x + i + qk_offset]);
-    qk_buf_[threadIdx.x + i + qk_offset] = FloatsToPair<half2>(
+    float2 cur =
+        pten::funcs::ToFloat2<half2>(qk_buf_[threadIdx.x + i + qk_offset] +
+                                     bias_qk_[threadIdx.x + i + qk_offset]);
+    qk_buf_[threadIdx.x + i + qk_offset] = pten::funcs::FloatsToPair<half2>(
         __expf(cur.x - max_val) / sum_val, __expf(cur.y - max_val) / sum_val);
   }
 #endif
@@ -595,13 +616,14 @@ __global__ void SkipLayerNormSmallKernel(int num, int hidden, const T *input1,
   const T rld = T(1) / T(hidden);
   const int offset = blockIdx.x * hidden;
   cub::Sum pair_sum;
-  kvp<T> thread_data(0, 0);
+  pten::funcs::kvp<T> thread_data(0, 0);
   const int idx = offset + threadIdx.x;
   T val = 0;
   if (threadIdx.x < hidden) {
     val = input1[idx] + input2[idx];
     const T rldval = rld * val;
-    thread_data = pair_sum(thread_data, kvp<T>(rldval, rldval * val));
+    thread_data =
+        pair_sum(thread_data, pten::funcs::kvp<T>(rldval, rldval * val));
   }
   LayerNormSmall<T, TPB>(val, thread_data, hidden, idx, bias, scale, output,
                          eps);
@@ -617,13 +639,14 @@ __global__ void SkipLayerNormSmallKernel<half, 32>(
   const half rld = half(1) / half(hidden);
   const int offset = blockIdx.x * hidden;
   cub::Sum pair_sum;
-  kvp<half> thread_data(0, 0);
+  pten::funcs::kvp<half> thread_data(0, 0);
   const int idx = offset + threadIdx.x;
   half val = 0;
   if (threadIdx.x < hidden) {
     val = input1[idx] + input2[idx];
     const half rldval = rld * val;
-    thread_data = pair_sum(thread_data, kvp<half>(rldval, rldval * val));
+    thread_data =
+        pair_sum(thread_data, pten::funcs::kvp<half>(rldval, rldval * val));
   }
   LayerNormSmall<half, 32>(val, thread_data, hidden, idx, bias, scale, output,
                            eps);
@@ -638,13 +661,14 @@ __global__ void SkipLayerNormSmallKernel<half, 128>(
   const half rld = half(1) / half(hidden);
   const int offset = blockIdx.x * hidden;
   cub::Sum pair_sum;
-  kvp<half> thread_data(0, 0);
+  pten::funcs::kvp<half> thread_data(0, 0);
   const int idx = offset + threadIdx.x;
   half val = 0;
   if (threadIdx.x < hidden) {
     val = input1[idx] + input2[idx];
     const half rldval = rld * val;
-    thread_data = pair_sum(thread_data, kvp<half>(rldval, rldval * val));
+    thread_data =
+        pair_sum(thread_data, pten::funcs::kvp<half>(rldval, rldval * val));
   }
   LayerNormSmall<half, 128>(val, thread_data, hidden, idx, bias, scale, output,
                             eps);
@@ -659,13 +683,14 @@ __global__ void SkipLayerNormSmallKernel<half, 384>(
   const half rld = half(1) / half(hidden);
   const int offset = blockIdx.x * hidden;
   cub::Sum pair_sum;
-  kvp<half> thread_data(0, 0);
+  pten::funcs::kvp<half> thread_data(0, 0);
   const int idx = offset + threadIdx.x;
   half val = 0;
   if (threadIdx.x < hidden) {
     val = input1[idx] + input2[idx];
     const half rldval = rld * val;
-    thread_data = pair_sum(thread_data, kvp<half>(rldval, rldval * val));
+    thread_data =
+        pair_sum(thread_data, pten::funcs::kvp<half>(rldval, rldval * val));
   }
   LayerNormSmall<half, 384>(val, thread_data, hidden, idx, bias, scale, output,
                             eps);
@@ -681,13 +706,14 @@ __global__ void SkipLayerNormKernel(int num, int hidden, const T *input1,
   const T rld = T(1) / T(hidden);
   const int offset = blockIdx.x * hidden;
   cub::Sum pair_sum;
-  kvp<T> thread_data(0, 0);
+  pten::funcs::kvp<T> thread_data(0, 0);
   for (int it = threadIdx.x; it < hidden; it += TPB) {
     const int idx = offset + it;
     const T val = input1[idx] + input2[idx];
     const T rldval = rld * val;
-    thread_data = pair_sum(thread_data, kvp<T>(rldval, rldval * val));
+    thread_data =
+        pair_sum(thread_data, pten::funcs::kvp<T>(rldval, rldval * val));
     output[idx] = val;
   }
   LayerNorm<T, TPB>(thread_data, hidden, offset, bias, scale, output, eps);
@@ -705,13 +731,14 @@ __global__ void SkipLayerNormKernel<half, 256>(int num, int hidden,
   const half rld = half(1) / half(hidden);
   const int offset = blockIdx.x * hidden;
   cub::Sum pair_sum;
-  kvp<half> thread_data(0, 0);
+  pten::funcs::kvp<half> thread_data(0, 0);
   for (int it = threadIdx.x; it < hidden; it += 256) {
     const int idx = offset + it;
     const half val = input1[idx] + input2[idx];
     const half rldval = rld * val;
-    thread_data = pair_sum(thread_data, kvp<half>(rldval, rldval * val));
+    thread_data =
+        pair_sum(thread_data, pten::funcs::kvp<half>(rldval, rldval * val));
     output[idx] = val;
   }
   LayerNorm<half, 256>(thread_data, hidden, offset, bias, scale, output, eps);
@@ -727,13 +754,14 @@ __global__ void SkipLayerNormKernel2(int num, int hidden, const T2 *input1,
   const T rld = T(0.5f / hidden);  // because hidden is hidden/2
   const int offset = blockIdx.x * hidden;
   cub::Sum pair_sum;
-  kvp<T> thread_data(0, 0);
+  pten::funcs::kvp<T> thread_data(0, 0);
   for (int it = threadIdx.x; it < hidden; it += TPB) {
     const int idx = offset + it;
     const T2 val2 = input1[idx] + input2[idx];
     thread_data = pair_sum(
-        thread_data, kvp<T>(rld * (val2.x + val2.y),
+        thread_data,
+        pten::funcs::kvp<T>(rld * (val2.x + val2.y),
                             rld * val2.x * val2.x + rld * val2.y * val2.y));
     output[idx] = val2;
   }
@@ -751,13 +779,14 @@ __global__ void SkipLayerNormKernel2<half, half2, 256>(
   const half rld = half(0.5f / hidden);  // because hidden is hidden/2
   const int offset = blockIdx.x * hidden;
   cub::Sum pair_sum;
-  kvp<half> thread_data(0, 0);
+  pten::funcs::kvp<half> thread_data(0, 0);
   for (int it = threadIdx.x; it < hidden; it += 256) {
     const int idx = offset + it;
     const half2 val2 = input1[idx] + input2[idx];
     thread_data = pair_sum(
-        thread_data, kvp<half>(rld * (val2.x + val2.y),
+        thread_data,
+        pten::funcs::kvp<half>(rld * (val2.x + val2.y),
                                rld * val2.x * val2.x + rld * val2.y * val2.y));
     output[idx] = val2;
   }
...
@@ -14,9 +14,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
-#include "paddle/fluid/operators/math/math_cuda_utils.h"
 #include "paddle/fluid/operators/optimizers/lars_momentum_op.h"
 #include "paddle/fluid/platform/fast_divmod.h"
+#include "paddle/pten/kernels/funcs/math_cuda_utils.h"

 #if CUDA_VERSION >= 11000
 #include <cooperative_groups.h>
@@ -170,8 +170,8 @@ __global__ void L2NormKernel(
     g_tmp += (tmp1 * tmp1);
     tid += grid_stride;
   }
-  p_tmp = math::blockReduceSum<MT>(p_tmp, FINAL_MASK);
-  g_tmp = math::blockReduceSum<MT>(g_tmp, FINAL_MASK);
+  p_tmp = pten::funcs::blockReduceSum<MT>(p_tmp, FINAL_MASK);
+  g_tmp = pten::funcs::blockReduceSum<MT>(g_tmp, FINAL_MASK);

   if (threadIdx.x == 0) {
     p_buffer[blockIdx.x] = p_tmp;
@@ -181,8 +181,8 @@ __global__ void L2NormKernel(
   cg->sync();  // Grid sync for writring partial result to gloabl memory
   MT p_part_sum = threadIdx.x < gridDim.x ? p_buffer[threadIdx.x] : 0;
   MT g_part_sum = threadIdx.x < gridDim.x ? g_buffer[threadIdx.x] : 0;
-  MT tmp0 = math::blockReduceSum<MT>(p_part_sum, FINAL_MASK);
-  MT tmp1 = math::blockReduceSum<MT>(g_part_sum, FINAL_MASK);
+  MT tmp0 = pten::funcs::blockReduceSum<MT>(p_part_sum, FINAL_MASK);
+  MT tmp1 = pten::funcs::blockReduceSum<MT>(g_part_sum, FINAL_MASK);
   if (threadIdx.x == 0) {
     s_buffer[0] = tmp0;
     s_buffer[1] = tmp1;
@@ -294,9 +294,10 @@ __global__ void MomentumLarsKernel(
   MT param_part_norm = threadIdx.x < thresh ? p_buffer[threadIdx.x] : 0;
   MT grad_part_norm = threadIdx.x < thresh ? g_buffer[threadIdx.x] : 0;
   __syncthreads();
-  MT param_norm = Sqrt(math::blockReduceSum<MT>(param_part_norm, FINAL_MASK));
-  MT grad_norm = Sqrt(rescale_grad_pow *
-                      math::blockReduceSum<MT>(grad_part_norm, FINAL_MASK));
+  MT param_norm =
+      Sqrt(pten::funcs::blockReduceSum<MT>(param_part_norm, FINAL_MASK));
+  MT grad_norm = Sqrt(rescale_grad_pow * pten::funcs::blockReduceSum<MT>(
+                                             grad_part_norm, FINAL_MASK));
 #endif
   MomentumUpdate<T, MT>(param, grad, velocity, param_out, velocity_out,
                         master_param, master_param_out, learning_rate, mu,
...
@@ -16,7 +16,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
-#include "paddle/fluid/operators/math/math_cuda_utils.h"
 #include "paddle/fluid/operators/softmax_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
...
@@ -23,9 +23,8 @@ limitations under the License. */
 #include <algorithm>

-namespace paddle {
-namespace operators {
-namespace math {
+namespace pten {
+namespace funcs {

 template <typename T>
 __device__ __forceinline__ T FromFloat(float a);
@@ -315,6 +314,5 @@ __inline__ __device__ T PartialBlockReduceMin(T val, unsigned mask) {
   return val;
 }

-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace pten