// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/flash_attn_kernel.h"

#include <cmath>

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"

#include "paddle/phi/kernels/arange_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/reshape_kernel.h"

#ifdef PADDLE_WITH_FLASHATTN
#include "paddle/phi/backends/dynload/flashattn.h"
#endif

namespace phi {

template <typename T, typename Context>
void FlashAttnUnpaddedKernel(const Context& ctx,
                             const DenseTensor& q,
                             const DenseTensor& k,
                             const DenseTensor& v,
                             const DenseTensor& cu_seqlens_q,
                             const DenseTensor& cu_seqlens_k,
                             int64_t max_seqlen_q,
                             int64_t max_seqlen_k,
                             float scale,
                             float dropout,
                             bool causal,
                             bool return_softmax,
                             bool is_test,
                             DenseTensor* out,
                             DenseTensor* softmax,
                             DenseTensor* softmax_lse,
                             DenseTensor* seed_offset) {
#ifdef PADDLE_WITH_FLASHATTN
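  // Dropout is disabled when running in inference mode.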
  if (is_test) dropout = 0.0f;

  ctx.template Alloc<T>(out);

  cudaStream_t stream = ctx.stream();
  bool is_bf16 = q.dtype() == DataType::BFLOAT16;

  // q,k,v [total_*, num_heads, head_dim]

  auto dims = q.dims();
  PADDLE_ENFORCE_EQ(
      dims.size(),
      3,
      phi::errors::InvalidArgument(
          "flash_attn_unpadded expects a 3-D input with shape "
          "[total_seq_len, num_heads, head_dim]"));

  int64_t total_q = dims[0];
  int64_t num_heads = dims[1];
  int64_t head_size = dims[2];

  int64_t total_k = k.dims()[0];
  int64_t batch_size = cu_seqlens_q.numel() - 1;

  int num_splits = 0;  // 0 for an internal heuristic, which is optimal
  bool zero_tensors = false;

  auto gen = ctx.GetGenerator();
  uint64_t inc = batch_size * num_heads * 32;
  auto seed_offset_pair = gen->IncrementOffset(inc);
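  // seed_offset_pair holds the RNG seed and offset for this call; they are
  // saved below so the backward pass can replay the same dropout mask.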

  uint64_t seed = seed_offset_pair.first;
  uint64_t offset = seed_offset_pair.second;

  seed_offset->Resize({2});
  auto* seed_offset_data = ctx.template HostAlloc<int64_t>(seed_offset);
  seed_offset_data[0] = static_cast<int64_t>(seed);
  seed_offset_data[1] = static_cast<int64_t>(offset);

  int64_t seq_len_q = ((max_seqlen_q + 16 - 1) / 16) * 16;
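  // softmax_lse uses a per-sequence length rounded up to a multiple of 16.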

  softmax_lse->Resize({batch_size, num_heads, seq_len_q});
  ctx.template Alloc<float>(softmax_lse);

  if (return_softmax) {
    // The returned softmax may be allocated with more columns than
    // max_seqlen_k: its width is rounded up to the kernel block size below.
    int64_t blocksize_c = head_size > 64 ? 128 : 256;
    int64_t seq_len_k =
        ((max_seqlen_k + blocksize_c - 1) / blocksize_c) * blocksize_c;
    if (max_seqlen_k <= 128) {
      seq_len_k = 128;
    } else if (max_seqlen_k <= 256) {
      seq_len_k = 256;
    }
    softmax->Resize({batch_size, num_heads, seq_len_q, seq_len_k});
    ctx.template Alloc<T>(softmax);
  }

  uint64_t workspace_size = 0;

  // TODO(kuizhiqing) pass allocation/empty func in capi to decouple
  // calculate workspace size before execution
  bool succ =
      phi::dynload::flash_attn_fwd(q.data(),
                                   k.data(),
                                   v.data(),
                                   nullptr,  // for calculation workspace size
                                   cu_seqlens_q.data(),
                                   cu_seqlens_k.data(),
                                   total_q,
                                   total_k,
                                   batch_size,
                                   num_heads,
                                   head_size,
                                   max_seqlen_q,
                                   max_seqlen_k,
                                   dropout,
                                   scale,
                                   zero_tensors,
                                   causal,
                                   is_bf16,
                                   num_splits,
                                   softmax_lse->data(),
                                   return_softmax ? softmax->data() : nullptr,
                                   nullptr,
                                   &workspace_size,
                                   stream,
                                   seed,
                                   offset);

  if (!succ) {
    PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error()));
  }

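  // Allocate the scratch workspace requested by the library; workspace_size is
  // given in bytes.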
  DenseTensor workspace;
  if (workspace_size > 0) {
    workspace = Empty<float>(ctx, {int64_t(workspace_size / sizeof(float))});
  }

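  // Run the forward pass for real this time, writing to out and softmax_lse.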
  succ = phi::dynload::flash_attn_fwd(
      q.data(),
      k.data(),
      v.data(),
      out->data(),
      cu_seqlens_q.data(),
      cu_seqlens_k.data(),
      total_q,
      total_k,
      batch_size,
      num_heads,
      head_size,
      max_seqlen_q,
      max_seqlen_k,
      dropout,
      scale,
      zero_tensors,
      causal,
      is_bf16,
      num_splits,
      softmax_lse->data(),
      return_softmax ? softmax->data() : nullptr,
      workspace_size > 0 ? workspace.data() : nullptr,
      &workspace_size,
      stream,
      seed,
      offset);

  if (!succ) {
    PADDLE_THROW(phi::errors::External(phi::dynload::flash_attn_error()));
  }

#endif
}

template <typename T, typename Context>
void FlashAttnKernel(const Context& ctx,
                     const DenseTensor& q,
                     const DenseTensor& k,
                     const DenseTensor& v,
                     float dropout,
                     bool causal,
                     bool return_softmax,
                     bool is_test,
                     DenseTensor* out,
                     DenseTensor* softmax,
                     DenseTensor* softmax_lse,
                     DenseTensor* seed_offset) {
#ifdef PADDLE_WITH_FLASHATTN
  // q,k,v [batch_size, seq_len, num_heads, head_dim]

  auto dims = q.dims();
  PADDLE_ENFORCE_EQ(dims.size(),
                    4,
                    phi::errors::InvalidArgument(
                        "flash_attn expects a 4-D input with shape "
                        "[batch_size, seq_len, num_heads, head_dim]"));

  int64_t batch_size = dims[0];
  int64_t seq_len_q = dims[1];
  int64_t num_heads = dims[2];
  int64_t head_size = dims[3];

  int64_t seq_len_k = k.dims()[1];

  int64_t total_q = batch_size * seq_len_q;
  int64_t total_k = batch_size * seq_len_k;

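  // Default attention scaling: 1 / sqrt(head_dim).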
  float scale = 1.0f / std::sqrt(head_size);

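  // View the padded [batch_size, seq_len, num_heads, head_dim] tensors as
  // unpadded [total, num_heads, head_dim] tensors without copying.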
  DenseTensor q_t_s, k_t_s, v_t_s;
  q_t_s.ShareDataWith(q).Resize({total_q, num_heads, head_size});
  k_t_s.ShareDataWith(k).Resize({total_k, num_heads, head_size});
  v_t_s.ShareDataWith(v).Resize({total_k, num_heads, head_size});

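  // All sequences share the same (padded) length, so the cumulative sequence
  // offsets are simply 0, seq_len, 2 * seq_len, ..., batch_size * seq_len.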
  DenseTensor cu_seqlens_q;
  DenseTensor cu_seqlens_k;
  ArangeNullaryKernel<int32_t, Context>(
      ctx, 0, (batch_size + 1) * seq_len_q, seq_len_q, &cu_seqlens_q);
  ArangeNullaryKernel<int32_t, Context>(
      ctx, 0, (batch_size + 1) * seq_len_k, seq_len_k, &cu_seqlens_k);

  FlashAttnUnpaddedKernel<T, Context>(ctx,
                                      q_t_s,
                                      k_t_s,
                                      v_t_s,
                                      cu_seqlens_q,
                                      cu_seqlens_k,
                                      seq_len_q,
                                      seq_len_k,
                                      scale,
                                      dropout,
                                      causal,
                                      return_softmax,
                                      is_test,
                                      out,
                                      softmax,
                                      softmax_lse,
                                      seed_offset);

#endif
}

}  // namespace phi

PD_REGISTER_KERNEL(flash_attn_unpadded,
                   GPU,
                   ALL_LAYOUT,
                   phi::FlashAttnUnpaddedKernel,
                   phi::dtype::float16,
                   phi::dtype::bfloat16) {}

PD_REGISTER_KERNEL(flash_attn,
                   GPU,
                   ALL_LAYOUT,
                   phi::FlashAttnKernel,
                   phi::dtype::float16,
                   phi::dtype::bfloat16) {}