Unverified · Commit 9aa7b638 authored by Connor Holmes, committed by GitHub

Kernel Data Conversion Utility (#2327)

* Unify macro definitions and constants in a single file

* Conversion utility implementation.

* Fix reversion from formatting

* Bugfixes after testing with correct DeepSpeed

* Inline markers are available on both HIP + CUDA
Parent 99326438
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
#pragma once
#include "ds_kernel_utils.h"
#include <cuda_fp16.h>
#include <stdint.h>
#ifdef BF16_AVAILABLE
#include <cuda_bf16.h>
#endif
namespace conversion {
// Basic primitive for constructing conversions
template <typename TO, typename FROM>
DS_D_INLINE TO to(FROM val)
{
return to(val);
}
// Specializations
/********************* Identity Conversions *********************/
/*
Identity conversions are useful in templated functions where we might have
a fixed destination type. For example, I might have a kernel that accepts
__half, __nv_bfloat16, and float but always does its core computation in fp32:

T mem_value = input[idx];
float compute_value = conversion::to<float, T>(mem_value);

In practice, we should be able to elide the second template parameter:

float compute_val = conversion::to<float>(mem_value);

In this case, we need an implementation to handle the T = float case.

NOTE: The type inference system appears to be unable to handle inferring the first
template parameter, even in the trivial case.
*/
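// Illustrative sketch of the pattern described above (not part of this header):
// a kernel templated on its storage type that always computes in fp32. The kernel
// name `scaled_copy` and its arguments are hypothetical.
//
//   template <typename T>
//   __global__ void scaled_copy(T* out, const T* in, float alpha, int n)
//   {
//       const int idx = blockIdx.x * blockDim.x + threadIdx.x;
//       if (idx < n) {
//           float v = conversion::to<float>(in[idx]);
//           out[idx] = conversion::to<T>(v * alpha);
//       }
//   }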
// Floating point types
template <>
DS_D_INLINE double to(double val)
{
return val;
}
template <>
DS_D_INLINE float to(float val)
{
return val;
}
template <>
DS_D_INLINE __half to(__half val)
{
return val;
}
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE __nv_bfloat16 to(__nv_bfloat16 val)
{
return val;
}
#endif
// Integer types
template <>
DS_D_INLINE int8_t to(int8_t val)
{
return val;
}
template <>
DS_D_INLINE uint8_t to(uint8_t val)
{
return val;
}
template <>
DS_D_INLINE int16_t to(int16_t val)
{
return val;
}
template <>
DS_D_INLINE uint16_t to(uint16_t val)
{
return val;
}
template <>
DS_D_INLINE int32_t to(int32_t val)
{
return val;
}
template <>
DS_D_INLINE uint32_t to(uint32_t val)
{
return val;
}
template <>
DS_D_INLINE int64_t to(int64_t val)
{
return val;
}
template <>
DS_D_INLINE uint64_t to(uint64_t val)
{
return val;
}
// TODO: evaluate if we want bools
/********************* To Double Conversions *********************/
// * to double variants
// We would normally prefer not to use a C-style cast, but this conversion is
// important enough to keep
template <>
DS_D_INLINE double to(float val)
{
#ifdef PTX_AVAILABLE
double ret_val;
asm("ctv.rn.f64.f32 %0, %1;\n" : "=d"(ret_val) : "f"(val));
return ret_val;
#else
return double(val);
#endif
}
// Note: there is a CVT instruction for __half -> double, but there's no inline interface
// for passing a single half value
template <>
DS_D_INLINE double to(__half val)
{
return to<double>(__half2float(val));
}
template <>
DS_D_INLINE double to(int64_t val)
{
return __ll2double_rn(val);
}
template <>
DS_D_INLINE double to(int32_t val)
{
return __int2double_rn(val);
}
template <>
DS_D_INLINE double to(int16_t val)
{
return __int2double_rn(val);
}
template <>
DS_D_INLINE double to(int8_t val)
{
return __int2double_rn(val);
}
template <>
DS_D_INLINE double to(uint64_t val)
{
return __ull2double_rn(val);
}
template <>
DS_D_INLINE double to(uint32_t val)
{
return __uint2double_rn(val);
}
template <>
DS_D_INLINE double to(uint16_t val)
{
return __uint2double_rn(val);
}
template <>
DS_D_INLINE double to(uint8_t val)
{
return __uint2double_rn(val);
}
// Same applies here
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE double to(__nv_bfloat16 val)
{
return to<double>(__bfloat162float(val));
}
#endif
/********************* To Float Conversions *********************/
template <>
DS_D_INLINE float to(double val)
{
return __double2float_rn(val);
}
template <>
DS_D_INLINE float to(__half val)
{
return __half2float(val);
}
template <>
DS_D_INLINE float to(int64_t val)
{
return __ll2float_rn(val);
}
template <>
DS_D_INLINE float to(int32_t val)
{
return __int2float_rn(val);
}
template <>
DS_D_INLINE float to(int16_t val)
{
return __int2float_rn(val);
}
template <>
DS_D_INLINE float to(int8_t val)
{
return __int2float_rn(val);
}
template <>
DS_D_INLINE float to(uint64_t val)
{
return __ull2float_rn(val);
}
template <>
DS_D_INLINE float to(uint32_t val)
{
return __uint2float_rn(val);
}
template <>
DS_D_INLINE float to(uint16_t val)
{
return __uint2float_rn(val);
}
template <>
DS_D_INLINE float to(uint8_t val)
{
return __uint2float_rn(val);
}
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE float to(__nv_bfloat16 val)
{
return __bfloat162float(val);
}
#endif
/********************* To Float2 Conversions *********************/
template <>
DS_D_INLINE float2 to(__half2 val)
{
return __half22float2(val);
}
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE float2 to(__nv_bfloat162 val)
{
return __bfloat1622float2(val);
}
#endif
/********************* To Half Conversions *********************/
template <>
DS_D_INLINE __half to(double val)
{
return __double2half(val);
}
template <>
DS_D_INLINE __half to(float val)
{
return __float2half(val);
}
template <>
DS_D_INLINE __half to(int64_t val)
{
return __ll2half_rn(val);
}
template <>
DS_D_INLINE __half to(int32_t val)
{
return __int2half_rn(val);
}
template <>
DS_D_INLINE __half to(int16_t val)
{
return __short2half_rn(val);
}
template <>
DS_D_INLINE __half to(int8_t val)
{
return __int2half_rn(val);
}
template <>
DS_D_INLINE __half to(uint64_t val)
{
return __ull2half_rn(val);
}
template <>
DS_D_INLINE __half to(uint32_t val)
{
return __uint2half_rn(val);
}
template <>
DS_D_INLINE __half to(uint16_t val)
{
return __ushort2half_rn(val);
}
template <>
DS_D_INLINE __half to(uint8_t val)
{
return __uint2half_rn(val);
}
#ifdef BF16_AVAILABLE
// No direct conversion
template <>
DS_D_INLINE __half to(__nv_bfloat16 val)
{
return to<__half>(to<float>(val));
}
#endif
/********************* To Half2 Conversions *********************/
template <>
DS_D_INLINE __half2 to(float2 val)
{
return __float22half2_rn(val);
}
#ifdef BF16_AVAILABLE
// No direct conversion
template <>
DS_D_INLINE __half2 to(__nv_bfloat162 val)
{
return to<__half2>(to<float2>(val));
}
#endif
/********************* To BF16 Conversions *********************/
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE __nv_bfloat16 to(double val)
{
return __double2bfloat16(val);
}
template <>
DS_D_INLINE __nv_bfloat16 to(float val)
{
return __float2bfloat16(val);
}
template <>
DS_D_INLINE __nv_bfloat16 to(int64_t val)
{
return __ll2bfloat16_rn(val);
}
template <>
DS_D_INLINE __nv_bfloat16 to(int32_t val)
{
return __int2bfloat16_rn(val);
}
template <>
DS_D_INLINE __nv_bfloat16 to(int16_t val)
{
return __short2bfloat16_rn(val);
}
template <>
DS_D_INLINE __nv_bfloat16 to(int8_t val)
{
return __int2bfloat16_rn(val);
}
template <>
DS_D_INLINE __nv_bfloat16 to(uint64_t val)
{
return __ull2bfloat16_rn(val);
}
template <>
DS_D_INLINE __nv_bfloat16 to(uint32_t val)
{
return __uint2bfloat16_rn(val);
}
template <>
DS_D_INLINE __nv_bfloat16 to(uint16_t val)
{
return __ushort2bfloat16_rn(val);
}
template <>
DS_D_INLINE __nv_bfloat16 to(uint8_t val)
{
return __uint2bfloat16_rn(val);
}
#endif
/********************* To BF162 Conversions *********************/
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE __nv_bfloat162 to(float2 val)
{
return __float22bfloat162_rn(val);
}
template <>
DS_D_INLINE __nv_bfloat162 to(__half2 val)
{
return to<__nv_bfloat162>(to<float2>(val));
}
#endif
/********************* To INT64_T Conversions *********************/
template <>
DS_D_INLINE int64_t to(double val)
{
return __double2ll_rn(val);
}
template <>
DS_D_INLINE int64_t to(float val)
{
return __float2ll_rn(val);
}
template <>
DS_D_INLINE int64_t to(__half val)
{
return __half2ll_rn(val);
}
// No direct support for integer casts at the C++ level, and I don't feel they're important
// enough to demand a PTX implementation at this time
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE int64_t to(__nv_bfloat16 val)
{
return __bfloat162ll_rn(val);
}
#endif
/********************* To INT32_T Conversions *********************/
template <>
DS_D_INLINE int32_t to(double val)
{
return __double2int_rn(val);
}
template <>
DS_D_INLINE int32_t to(float val)
{
return __float2int_rn(val);
}
template <>
DS_D_INLINE int32_t to(__half val)
{
return __half2int_rn(val);
}
// No direct support for integer casts at the C++ level, and I don't feel they're important
// enough to demand a PTX implementation at this time
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE int32_t to(__nv_bfloat16 val)
{
return __bfloat162int_rn(val);
}
#endif
/********************* To INT16_T Conversions *********************/
template <>
DS_D_INLINE int16_t to(double val)
{
return __double2int_rn(val);
}
template <>
DS_D_INLINE int16_t to(float val)
{
return __float2int_rn(val);
}
template <>
DS_D_INLINE int16_t to(__half val)
{
return __half2int_rn(val);
}
// No direct support for integer casts at the C++ level, and I don't feel they're important
// enough to demand a PTX implementation at this time
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE int16_t to(__nv_bfloat16 val)
{
return __bfloat162int_rn(val);
}
#endif
/********************* To INT8_T Conversions *********************/
template <>
DS_D_INLINE int8_t to(double val)
{
return __double2int_rn(val);
}
template <>
DS_D_INLINE int8_t to(float val)
{
return __float2int_rn(val);
}
template <>
DS_D_INLINE int8_t to(__half val)
{
return __half2int_rn(val);
}
// No direct support for integer casts at the C++ level, and I don't feel they're important
// enough to demand a PTX implementation at this time
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE int8_t to(__nv_bfloat16 val)
{
return __bfloat162int_rn(val);
}
#endif
/********************* To UINT64_T Conversions *********************/
template <>
DS_D_INLINE uint64_t to(double val)
{
return __double2ull_rn(val);
}
template <>
DS_D_INLINE uint64_t to(float val)
{
return __float2ull_rn(val);
}
template <>
DS_D_INLINE uint64_t to(__half val)
{
return __half2ull_rn(val);
}
// No direct support for integer casts at the C++ level, and I don't feel they're important
// enough to demand a PTX implementation at this time
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE uint64_t to(__nv_bfloat16 val)
{
return __bfloat162ull_rn(val);
}
#endif
/********************* To UINT32_T Conversions *********************/
template <>
DS_D_INLINE uint32_t to(double val)
{
return __double2uint_rn(val);
}
template <>
DS_D_INLINE uint32_t to(float val)
{
return __float2uint_rn(val);
}
template <>
DS_D_INLINE uint32_t to(__half val)
{
return __half2uint_rn(val);
}
// No direct support for integer casts at the C++ level, and I don't feel they're important
// enough to demand a PTX implementation at this time
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE uint32_t to(__nv_bfloat16 val)
{
return __bfloat162uint_rn(val);
}
#endif
/********************* To UINT16_T Conversions *********************/
template <>
DS_D_INLINE uint16_t to(double val)
{
return __double2uint_rn(val);
}
template <>
DS_D_INLINE uint16_t to(float val)
{
return __float2uint_rn(val);
}
template <>
DS_D_INLINE uint16_t to(__half val)
{
return __half2uint_rn(val);
}
// No direct support for integer casts at the C++ level, and I don't feel they're important
// enough to demand a PTX implementation at this time
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE uint16_t to(__nv_bfloat16 val)
{
return __bfloat162uint_rn(val);
}
#endif
/********************* To UINT8_T Conversions *********************/
template <>
DS_D_INLINE uint8_t to(double val)
{
return __double2uint_rn(val);
}
template <>
DS_D_INLINE uint8_t to(float val)
{
return __float2uint_rn(val);
}
template <>
DS_D_INLINE uint8_t to(__half val)
{
return __half2uint_rn(val);
}
// No direct support for integer casts at the C++ level, and I don't feel they're important
// enough to demand a PTX implementation at this time
#ifdef BF16_AVAILABLE
template <>
DS_D_INLINE uint8_t to(__nv_bfloat16 val)
{
return __bfloat162uint_rn(val);
}
#endif
} // namespace conversion
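A minimal usage sketch for the packed conversions above (not part of the commit; the kernel name and indexing are hypothetical). It widens a packed half-precision pair to float2, does the arithmetic in fp32, and narrows back, which is the same pattern the templated inference kernels below follow:

// Hypothetical example: vectorized bias-add computed in fp32.
// Works for T2 = __half2 (and __nv_bfloat162 when BF16_AVAILABLE), since those
// packed specializations are defined above; float2 has no identity specialization.
template <typename T2>
__global__ void bias_add_packed(T2* x, const T2* bias, int n_pairs)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n_pairs) {
        float2 v = conversion::to<float2>(x[idx]);
        float2 b = conversion::to<float2>(bias[idx]);
        v.x += b.x;
        v.y += b.y;
        x[idx] = conversion::to<T2>(v);
    }
}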
/*
Copyright 2022 The Microsoft DeepSpeed Team
*/
#pragma once
#include "ds_kernel_utils.h"
#include <cuda.h>
#include <cuda_fp16.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 530
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <curand_kernel.h>
#include "context.h"
#include "cublas_wrappers.h"
......
/*
Copyright 2022 The Microsoft DeepSpeed Team
Centralized header file for preprocessor macros and constants
used throughout the codebase.
*/
#pragma once
#include <cuda.h>
#define DS_HD_INLINE __host__ __device__ __forceinline__
#define DS_D_INLINE __device__ __forceinline__
#ifdef __HIP_PLATFORM_HCC__
// constexpr variant of warpSize for templating
constexpr int hw_warp_size = 64;
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else // !__HIP_PLATFORM_HCC__
// constexpr variant of warpSize for templating
constexpr int hw_warp_size = 32;
#if __CUDA_ARCH__ >= 530
#define HALF_PRECISION_AVAILABLE = 1
#define PTX_AVAILABLE
#endif // __CUDA_ARCH__ >= 530
#if __CUDA_ARCH__ >= 800
#define ASYNC_COPY_AVAILABLE
#define BF16_AVAILABLE
#endif // __CUDA_ARCH__ >= 800
#include <cooperative_groups.h>
#endif //__HIP_PLATFORM_HCC__
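A short sketch of how the centralized constants might be consumed (illustrative only; warp_reduce_sum is not part of this commit). The CUDA path is shown, where hw_warp_size is 32 and the sync shuffle intrinsics apply; a HIP build would use its own shuffle builtins:

// Hypothetical warp-wide sum using the constexpr hw_warp_size (CUDA path).
DS_D_INLINE float warp_reduce_sum(float val)
{
#pragma unroll
    for (int offset = hw_warp_size / 2; offset > 0; offset /= 2) {
        val += __shfl_down_sync(0xffffffff, val, offset);
    }
    return val;
}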
......
@@ -5,15 +5,7 @@ Copyright 2022 The Microsoft DeepSpeed Team
 #pragma once
 #include <cuda.h>
-#if __CUDA_ARCH__ >= 800
-#define ASYNC_COPY_AVAILABLE
-#endif
-// Discuss basic GPUs to support
-#if __CUDA_ARCH__ >= 530
-#define PTX_AVAILABLE
-#endif
+#include "ds_kernel_utils.h"
 /////////////////////////////// Memory Access Utils ///////////////////////////////
 namespace mem_access {
......
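The memory access helpers truncated above are what the refactored kernels use for 16-byte vectorized loads and stores; a minimal illustration of the pattern (the kernel itself is hypothetical, not from this commit):

// Hypothetical 16-byte-per-thread copy using the mem_access helpers.
__global__ void copy_vectorized(float* dst, const float* src, int total_count)
{
    constexpr int granularity = 16;                    // bytes moved per access
    constexpr int vals = granularity / sizeof(float);  // four floats per thread
    const int offset = (blockIdx.x * blockDim.x + threadIdx.x) * vals;
    if (offset < total_count) {
        float buf[vals];
        mem_access::load_global<granularity>(buf, src + offset);
        mem_access::store_global<granularity>(dst + offset, buf);
    }
}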
......
@@ -2,6 +2,7 @@
 Copyright 2022 The Microsoft DeepSpeed Team
 */
+#include "conversion_utils.h"
 #include "inference_cuda_layers.h"
 #include "memory_access_utils.h"
......
@@ -16,58 +17,29 @@ inline __device__ float gelu(const float x)
     return x * 0.5f * (1.0f + tanhf(sqrt_param * (x + mul_param * x * x * x)));
 }
-__global__ void fused_bias_gelu(float* input,
-                                const float* bias,
-                                int total_count,
-                                int intermediate_size)
+template <typename T>
+__global__ void fused_bias_gelu(T* input, const T* bias, int total_count, int intermediate_size)
 {
     // Input restriction: intermediate_size % vals_per_access == 0
     constexpr int granularity = 16;
-    constexpr int vals_per_access = granularity / sizeof(float);
-    const int offset = (blockIdx.x * blockDim.x + threadIdx.x) * vals_per_access;
+    constexpr int values_per_access = granularity / sizeof(T);
+    const int offset = (blockIdx.x * blockDim.x + threadIdx.x) * values_per_access;
     if (offset < total_count) {
-        float data[vals_per_access];
-        float data_bias[vals_per_access];
+        T data[values_per_access];
+        T data_bias[values_per_access];
         mem_access::load_global<granularity>(data, input + offset);
         mem_access::load_global<granularity>(data_bias, bias + (offset % intermediate_size));
 #pragma unroll
-        for (int i = 0; i < vals_per_access; i++) { data[i] = gelu(data[i] + data_bias[i]); }
-        mem_access::store_global<granularity>(input + offset, data);
-    }
-}
-__global__ void fused_bias_gelu(__half* input,
-                                const __half* bias,
-                                int total_count,
-                                int intermediate_size)
-{
-    // Input restriction: intermediate_size % vals_per_access == 0
-    // This kernel doubles the per-thread ALU workload as compared to the float implementation
-#ifdef HALF_PRECISION_AVAILABLE
-    constexpr int granularity = 16;
-    constexpr int vals_per_access = granularity / sizeof(__half);
-    int offset = (blockIdx.x * blockDim.x + threadIdx.x) * vals_per_access;
-    if (offset < total_count) {
-        // Divide by 2 since we store two values per __half2
-        __half2 data[vals_per_access / 2];
-        __half2 bias_data[vals_per_access / 2];
-        mem_access::load_global<granularity>(data, input + offset);
-        mem_access::load_global<granularity>(bias_data, bias + (offset % intermediate_size));
-#pragma unroll
-        for (int i = 0; i < vals_per_access / 2; i++) {
-            float2 data_f = __half22float2(data[i]);
-            float2 bias_f = __half22float2(bias_data[i]);
-            data[i] = __floats2half2_rn(gelu(data_f.x + bias_f.x), gelu(data_f.y + bias_f.y));
+        for (int i = 0; i < values_per_access; i++) {
+            float data_f = conversion::to<float>(data[i]);
+            float bias_f = conversion::to<float>(data_bias[i]);
+            data[i] = conversion::to<T>(gelu(data_f + bias_f));
         }
         mem_access::store_global<granularity>(input + offset, data);
     }
-#endif
 }
template <typename T>
......
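For reference, a sketch of how the templated kernel above might be dispatched from the host; the wrapper name, block size, and stream handling here are assumptions rather than the commit's actual launch_bias_gelu code:

// Hypothetical host-side launcher for the templated fused_bias_gelu kernel.
template <typename T>
void launch_fused_bias_gelu_sketch(T* input,
                                   const T* bias,
                                   int total_count,
                                   int intermediate_size,
                                   cudaStream_t stream)
{
    constexpr int threads = 1024;
    constexpr int vals_per_thread = 16 / sizeof(T);  // matches the kernel's 16-byte granularity
    const int elems_per_block = threads * vals_per_thread;
    const int blocks = (total_count + elems_per_block - 1) / elems_per_block;
    fused_bias_gelu<<<blocks, threads, 0, stream>>>(input, bias, total_count, intermediate_size);
}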
......
@@ -16,6 +16,8 @@ Copyright 2022 The Microsoft DeepSpeed Team
 #define GIGABYTE (1024 * 1024 * 1024)
 #define MAX_OUT_TOKENS 8192
+// TODO: refactor out
+#define WARP_SIZE 32
 #define CUDA_CHECK(callstr)                                                                    \
......
......
@@ -4,15 +4,7 @@ Copyright 2022 The Microsoft DeepSpeed Team
 #pragma once
-#ifdef __HIP_PLATFORM_HCC__
-#define HALF_PRECISION_AVAILABLE = 1
-#include <hip/hip_cooperative_groups.h>
-#else
-#if __CUDA_ARCH__ >= 530
-#define HALF_PRECISION_AVAILABLE = 1
-#endif
-#include <cooperative_groups.h>
-#endif
+#include "ds_kernel_utils.h"
 #include <cuda.h>
 #include <cuda_fp16.h>
......