/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <thrust/execution_policy.h>
#include <thrust/remove.h>
#include <thrust/scan.h>

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/sparse/common_shape.h"
#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"

namespace phi {
namespace sparse {

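// Check whether the `cols` consecutive elements starting at `data` are all
// zero, i.e. whether one row of the flattened [rows, cols] view is empty.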
template <typename T>
inline __device__ bool DevIsZero(const T* data, const int64_t cols) {
  const T zero = static_cast<T>(0);
  // TODO(zhangkaihuo): check the data is zero or not in parallel when cols > 1
  for (int64_t i = 0; i < cols; i++) {
    if (data[i] != zero) {
      return false;
    }
  }
  return true;
}

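// Count the non-zero rows of the flattened [rows, cols] view of the dense
// tensor. Threads cooperate through a block-shared counter that is finally
// accumulated into `non_zero_num`; `temp_indexs[i]` is set to i for a
// non-zero row and to -1 otherwise.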
template <typename T>
__global__ void GetNonZeroNums(const T* dense_data,
                               const int rows,
                               const int cols,
                               int* non_zero_num,
                               int* temp_indexs) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  __shared__ int counter;
  if (threadIdx.x == 0) counter = 0;
  __syncthreads();

  for (int i = tid; i < rows; i += gridDim.x * blockDim.x) {
    int index = -1;
    // TODO(zhangkaihuo): when cols=1, vectorization can be used
    if (!DevIsZero(dense_data + i * cols, cols)) {
      // use reductions?
      atomicAdd(&counter, 1);
      index = i;
    }
    temp_indexs[i] = index;
  }
  __syncthreads();
  if (threadIdx.x == 0) {
    atomicAdd(non_zero_num, counter);
  }
}

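// For each non-zero entry, decode its flattened index into `sparse_dim`
// coordinates (stored dimension-major in `indices`) and copy its `cols`
// values from the dense data into `sparse_data`.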
template <typename T>
__global__ void GetNonZeroElementsAndIndices(const T* dense_data,
                                             const int64_t sparse_dim,
                                             const int64_t cols,
                                             const int64_t* x_dims,
                                             const int non_zero_num,
                                             const int* indexs,
                                             int64_t* indices,
                                             T* sparse_data) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
    int64_t sparse_index = indexs[i];
    int64_t x_index = sparse_index;
    for (int64_t j = sparse_dim - 1; j >= 0; j--) {
      indices[j * non_zero_num + i] = sparse_index % x_dims[j];
      sparse_index /= x_dims[j];
    }

    for (int j = 0; j < cols; j++) {
      sparse_data[i * cols + j] = dense_data[x_index * cols + j];
    }
  }
}

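// Convert a dense tensor to a SparseCooTensor: count and mark the non-zero
// rows, compact their indices with thrust::remove, then decode the indices
// and gather the corresponding values on the device.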
template <typename T, typename Context>
void DenseToSparseCooKernel(const Context& dev_ctx,
                            const DenseTensor& x,
                            const int64_t sparse_dim,
                            SparseCooTensor* out) {
  const T* x_data = x.data<T>();
  const auto& x_dims = x.dims();
  PADDLE_ENFORCE_LE(sparse_dim,
                    x_dims.size(),
                    phi::errors::InvalidArgument(
                        "sparse_dim must be less than or equal to the size of "
                        "x.dims()"));
  PADDLE_ENFORCE_GT(
      sparse_dim, 0, phi::errors::InvalidArgument("sparse_dim must be >0"));
  auto dims_2d = flatten_to_2d(x_dims, sparse_dim);
  const int rows = dims_2d[0];
  const int cols = dims_2d[1];
  DenseTensor nums = phi::Empty<int32_t>(dev_ctx, {1});
  DenseTensor d_x_dims = phi::Empty<int64_t>(dev_ctx, {x_dims.size()});

  // 1. get the number of non-zero elements and the index of each non-zero
  // element
  int* nums_ptr = nums.data<int>();
  phi::backends::gpu::GpuMemsetAsync(
      nums_ptr, 0, sizeof(int), dev_ctx.stream());
  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1);

  DenseTensor temp_indexs = phi::Empty<int32_t>(dev_ctx, {rows});
  int* temp_indexs_ptr = temp_indexs.data<int>();

  GetNonZeroNums<<<config.block_per_grid.x,
                   config.thread_per_block.x,
                   0,
                   dev_ctx.stream()>>>(
      x_data, rows, cols, nums_ptr, temp_indexs_ptr);

#ifdef PADDLE_WITH_HIP
  thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
#else
  thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
#endif
                 temp_indexs_ptr,
                 temp_indexs_ptr + rows,
                 -1);

  // 2. copy non_zero_num to host, copy x_dims to device
  int non_zero_num = 0;
  phi::backends::gpu::GpuMemcpyAsync(&non_zero_num,
                                     nums_ptr,
                                     sizeof(int),
                                     gpuMemcpyDeviceToHost,
                                     dev_ctx.stream());
  phi::backends::gpu::GpuMemcpyAsync(d_x_dims.data<int64_t>(),
                                     x_dims.Get(),
                                     x_dims.size() * sizeof(x_dims[0]),
                                     gpuMemcpyHostToDevice,
                                     dev_ctx.stream());

  dev_ctx.Wait();  // wait for the copy to finish

  const auto values_dims =
      phi::funcs::sparse::InferDenseDims(x_dims, sparse_dim, non_zero_num);
  phi::DenseTensor indices = phi::Empty<int64_t>(
      dev_ctx, {sparse_dim, static_cast<int64_t>(non_zero_num)});
  int64_t* indices_data = indices.data<int64_t>();
  phi::DenseTensor values;
  values.Resize(values_dims);
  T* sparse_data = dev_ctx.template Alloc<T>(&values);

  // 3. calculate the indices and gather the values by the non-zero indexs
  config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
  GetNonZeroElementsAndIndices<<<config.block_per_grid.x,
                                 config.thread_per_block.x,
                                 0,
                                 dev_ctx.stream()>>>(x_data,
                                                     sparse_dim,
                                                     cols,
                                                     d_x_dims.data<int64_t>(),
                                                     non_zero_num,
                                                     temp_indexs_ptr,
                                                     indices_data,
                                                     sparse_data);
  out->SetMember(indices, values, x_dims, true);
}

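// Read the last entry of each batch's crows segment, which equals the number
// of non-zero elements in that batch.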
template <typename IntT>
__global__ void GetBatchSizes(const IntT* crows,
                              const int rows,
                              const int batchs,
                              IntT* batch_sizes) {
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < batchs) {
    batch_sizes[tid] = crows[tid * (rows + 1) + rows];
  }
}

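// Expand the compressed crows of one batch (selected by blockIdx.y) into
// explicit COO row indices; `batch_ptr` optionally receives the batch index
// of every non-zero element.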
template <typename IntT>
__global__ void ConvertCsrCrowsToCooRows(const IntT* crows_ptr,
                                         const IntT* crows_offsets,
                                         IntT* rows_ptr,
                                         IntT* batch_ptr,
                                         const int rows) {
  const int b = blockIdx.y;
  const int64_t offset = crows_offsets ? crows_offsets[b] : 0;
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  for (int i = tid; i < rows; i += gridDim.x * blockDim.x) {
    for (int j = crows_ptr[b * (rows + 1) + i];
         j < crows_ptr[b * (rows + 1) + i + 1];
         j++) {
      rows_ptr[offset + j] = i;
      if (batch_ptr) {
        batch_ptr[offset + j] = b;
      }
    }
  }
}

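// Convert a SparseCsrTensor to a SparseCooTensor: expand crows into row
// indices per batch (batch offsets come from an exclusive scan of the batch
// sizes) and copy the column indices and values unchanged.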
template <typename T, typename IntT>
void SparseCsrToCooGPUKernel(const GPUContext& dev_ctx,
                             const SparseCsrTensor& x,
                             SparseCooTensor* out) {
  const DDim& x_dims = x.dims();
  const int64_t non_zero_num = x.non_zero_cols().numel();
  const auto& csr_crows = x.non_zero_crows();
  const auto& csr_cols = x.non_zero_cols();
  const auto& csr_values = x.non_zero_elements();
  const IntT* csr_crows_data = csr_crows.data<IntT>();
  const IntT* csr_cols_data = csr_cols.data<IntT>();
  const T* csr_values_data = csr_values.data<T>();

  int64_t sparse_dim = 2;
  if (x_dims.size() == 3) {
    sparse_dim = 3;
  }
  int batchs = x_dims.size() == 2 ? 1 : x_dims[0];
  int rows = x_dims.size() == 2 ? x_dims[0] : x_dims[1];

  DenseTensor indices = phi::Empty<IntT>(dev_ctx, {sparse_dim, non_zero_num});
  DenseTensor values = phi::EmptyLike<T, GPUContext>(dev_ctx, csr_values);
  DenseTensor offsets = phi::Empty<IntT>(dev_ctx, {batchs});
  IntT* coo_indices = indices.data<IntT>();
  IntT* batch_ptr = x_dims.size() == 2 ? nullptr : coo_indices;
  IntT* coo_rows_data =
      x_dims.size() == 2 ? coo_indices : batch_ptr + non_zero_num;
  IntT* coo_cols_data = coo_rows_data + non_zero_num;
  IntT* offsets_ptr = batchs == 1 ? nullptr : offsets.data<IntT>();
  T* coo_values_data = values.data<T>();

  if (batchs > 1) {
    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batchs, 1);
    GetBatchSizes<IntT><<<config.block_per_grid.x,
                          config.thread_per_block.x,
                          0,
                          dev_ctx.stream()>>>(
        csr_crows_data, rows, batchs, offsets_ptr);

#ifdef PADDLE_WITH_HIP
    thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
#else
    thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
#endif
                           offsets_ptr,
                           offsets_ptr + batchs,
                           offsets_ptr);
  }

  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1);
  config.block_per_grid.y = batchs;
  ConvertCsrCrowsToCooRows<IntT><<<config.block_per_grid,
                                   config.thread_per_block.x,
                                   0,
                                   dev_ctx.stream()>>>(
      csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows);

  phi::backends::gpu::GpuMemcpyAsync(coo_cols_data,
                                     csr_cols_data,
                                     sizeof(IntT) * non_zero_num,
                                     gpuMemcpyDeviceToDevice,
                                     dev_ctx.stream());
  phi::backends::gpu::GpuMemcpyAsync(coo_values_data,
                                     csr_values_data,
                                     sizeof(T) * non_zero_num,
                                     gpuMemcpyDeviceToDevice,
                                     dev_ctx.stream());

  out->SetMember(indices, values, x_dims, true);
}

template <typename T, typename Context>
void SparseCsrToCooKernel(const Context& dev_ctx,
                          const SparseCsrTensor& x,
                          SparseCooTensor* out) {
  PD_VISIT_INTEGRAL_TYPES(
      x.non_zero_crows().dtype(), "SparseCsrToCooGPUKernel", ([&] {
        SparseCsrToCooGPUKernel<T, data_t>(dev_ctx, x, out);
      }));
}

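// For each batch index appearing in `batchs_ptr`, record the position one
// past its last element, i.e. the exclusive end of that batch's run.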
template <typename IntT>
__global__ void GetBatchsOffset(const IntT* batchs_ptr,
                                const int non_zero_num,
                                IntT* batchs_offset) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
    if (i == non_zero_num - 1 || batchs_ptr[i] != batchs_ptr[i + 1]) {
      batchs_offset[batchs_ptr[i]] = i + 1;
    }
  }
}

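// Build the CSR crows array of one batch (selected by blockIdx.y) from its
// sorted COO row indices; `batchs_offset` may be null when there is a single
// batch.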
template <typename IntT>
__global__ void ConvertCooRowsToCsrCrows(
    const IntT* batchs_offset,  // can be null if batchs = 1
    const IntT* coo_rows_data,
    IntT* csr_crows_data,
    const int rows,
    const int64_t non_zero_num) {
  const int b = blockIdx.y;
  int batch_non_zero_num =
      batchs_offset == nullptr ? non_zero_num : batchs_offset[b];
  if (batch_non_zero_num == 0) return;
  IntT batch_start = 0;
  if (b > 0) {
    batch_start = batchs_offset[b - 1];
    batch_non_zero_num -= batch_start;
  }
  const IntT* coo_rows_ptr = coo_rows_data + batch_start;
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  for (int i = tid; i < batch_non_zero_num; i += gridDim.x * blockDim.x) {
    if (i == 0) {
      for (IntT j = 0; j <= coo_rows_ptr[0]; j++) {
        csr_crows_data[b * (rows + 1) + j] = 0;
      }
    } else {
      for (IntT j = coo_rows_ptr[i - 1]; j < coo_rows_ptr[i]; j++) {
        csr_crows_data[b * (rows + 1) + j + 1] = i;
      }
    }
    if (i == batch_non_zero_num - 1) {
      for (IntT j = coo_rows_ptr[batch_non_zero_num - 1] + 1; j < rows + 1;
           j++) {
        csr_crows_data[b * (rows + 1) + j] = batch_non_zero_num;
      }
    }
  }
}

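// Convert a SparseCooTensor to a SparseCsrTensor: compute crows per batch
// from the COO row indices and copy the column indices and values unchanged.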
template <typename T, typename IntT>
void SparseCooToCsrGPUKernel(const GPUContext& dev_ctx,
                             const SparseCooTensor& x,
                             SparseCsrTensor* out) {
  const auto& x_dims = x.dims();
  bool valid = x_dims.size() == 2 || x_dims.size() == 3;
  PADDLE_ENFORCE_EQ(valid,
                    true,
                    phi::errors::InvalidArgument(
                        "SparseCsrTensor only supports 2-D or 3-D tensors"));
  const int64_t non_zero_num = x.nnz();
  if (non_zero_num <= 0) return;

  int batchs = x_dims.size() == 2 ? 1 : x_dims[0];
  int rows = x_dims.size() == 2 ? x_dims[0] : x_dims[1];

  phi::DenseTensor non_zero_crows =
      phi::Empty<IntT>(dev_ctx, {batchs * (rows + 1)});
  phi::DenseTensor non_zero_cols = phi::Empty<IntT>(dev_ctx, {non_zero_num});
  phi::DenseTensor non_zero_elements =
      phi::EmptyLike<T, GPUContext>(dev_ctx, x.non_zero_elements());
  IntT* csr_crows_data = non_zero_crows.data<IntT>();
  IntT* csr_cols_data = non_zero_cols.data<IntT>();
  T* csr_values_data = non_zero_elements.data<T>();

  const auto& coo_indices = x.non_zero_indices();
  const auto& coo_values = x.non_zero_elements();
  const IntT* batchs_ptr = coo_indices.data<IntT>();
  const IntT* coo_rows_data =
      batchs == 1 ? batchs_ptr : batchs_ptr + non_zero_num;
  const IntT* coo_cols_data = coo_rows_data + non_zero_num;
  const T* coo_values_data = coo_values.data<T>();

  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batchs, 1);
  if (batchs > 1) {
    phi::DenseTensor batchs_offset = phi::Empty<IntT>(dev_ctx, {batchs});
    IntT* batchs_offset_ptr = batchs_offset.data<IntT>();
    GetBatchsOffset<IntT>
        <<<config.block_per_grid.x,
           config.thread_per_block.x,
           0,
           dev_ctx.stream()>>>(batchs_ptr, non_zero_num, batchs_offset_ptr);
    config.block_per_grid.y = batchs;
    ConvertCooRowsToCsrCrows<IntT><<<config.block_per_grid,
                                     config.thread_per_block.x,
                                     0,
                                     dev_ctx.stream()>>>(
        batchs_offset_ptr, coo_rows_data, csr_crows_data, rows, non_zero_num);
  } else {
    ConvertCooRowsToCsrCrows<IntT><<<config.block_per_grid.x,
                                     config.thread_per_block.x,
                                     0,
                                     dev_ctx.stream()>>>(
        nullptr, coo_rows_data, csr_crows_data, rows, non_zero_num);
  }

  phi::backends::gpu::GpuMemcpyAsync(csr_cols_data,
                                     coo_cols_data,
                                     sizeof(IntT) * non_zero_num,
                                     gpuMemcpyDeviceToDevice,
                                     dev_ctx.stream());
  phi::backends::gpu::GpuMemcpyAsync(csr_values_data,
                                     coo_values_data,
                                     sizeof(T) * non_zero_num,
                                     gpuMemcpyDeviceToDevice,
                                     dev_ctx.stream());
  out->SetMember(non_zero_crows, non_zero_cols, non_zero_elements, x_dims);
}

template <typename T, typename Context>
void SparseCooToCsrKernel(const Context& dev_ctx,
                          const SparseCooTensor& x,
                          SparseCsrTensor* out) {
  PD_VISIT_INTEGRAL_TYPES(
      x.non_zero_indices().dtype(), "SparseCooToCsrGPUKernel", ([&] {
        SparseCooToCsrGPUKernel<T, data_t>(dev_ctx, x, out);
      }));
}

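// Scatter the sparse values into a zero-initialized dense buffer: each
// non-zero entry's coordinates are folded into a linear offset through
// `sparse_offsets`, then its `base_offset` trailing values are copied.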
template <typename ValueT, typename IndicesT>
__global__ void KernelSparseCooToDense(const IndicesT* indices,
                                       const int64_t* sparse_offsets,
                                       const ValueT* data,
                                       ValueT* dense_data,
                                       const IndicesT non_zero_num,
                                       const int64_t base_offset,
                                       const int64_t sparse_dim) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
    int64_t index = 0;
    for (int j = 0; j < sparse_dim; j++) {
      index += indices[j * non_zero_num + i] * sparse_offsets[j];
    }

    for (int j = 0; j < base_offset; j++) {
      dense_data[index * base_offset + j] = data[i * base_offset + j];
    }
  }
}

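// Convert a SparseCooTensor to a dense tensor: zero-fill the output, then
// scatter the non-zero values according to the COO indices.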
template <typename T, typename IntT>
void SparseCooToDenseGPUKernel(const GPUContext& dev_ctx,
                               const SparseCooTensor& x,
                               DenseTensor* out) {
  const auto non_zero_num = x.nnz();
  const auto dense_dims = x.dims();
  const auto indices = x.non_zero_indices();
  const auto values = x.non_zero_elements();
  const auto indices_dims = indices.dims();
  int64_t sparse_dim = indices_dims[0];
  if (indices_dims.size() == 1) {
    sparse_dim = 1;
  }
  const int64_t dense_dim = values.dims().size() - 1;

  const auto place = dev_ctx.GetPlace();
  const T* x_data = values.data<T>();
  *out = phi::Empty(dev_ctx,
                    phi::DenseTensorMeta(
                        x.dtype(), x.dims(), x.non_zero_elements().layout()));
  T* out_data = out->data<T>();
  int64_t base_offset = 1;
  for (int64_t i = 0; i < dense_dim; i++) {
    base_offset *= dense_dims[sparse_dim + i];
  }
  std::vector<int64_t> sparse_offsets(sparse_dim);
  int64_t offset = 1;
  for (int i = sparse_dim - 1; i >= 0; i--) {
    sparse_offsets[i] = offset;
    offset *= dense_dims[i];
  }

  DenseTensor d_sparse_offsets = Empty<int64_t>(dev_ctx, {sparse_dim});

  phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data<int64_t>(),
                                     sparse_offsets.data(),
                                     sparse_dim * sizeof(int64_t),
                                     gpuMemcpyHostToDevice,
                                     dev_ctx.stream());
  phi::backends::gpu::GpuMemsetAsync(
      out_data, 0, sizeof(T) * out->numel(), dev_ctx.stream());

  auto config =
      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);

  KernelSparseCooToDense<T, IntT>
      <<<config.block_per_grid.x,
         config.thread_per_block.x,
         0,
         dev_ctx.stream()>>>(indices.data<IntT>(),
                             d_sparse_offsets.data<int64_t>(),
                             x_data,
                             out_data,
                             non_zero_num,
                             base_offset,
                             sparse_dim);
}

template <typename T, typename Context>
void SparseCooToDenseKernel(const Context& dev_ctx,
                            const SparseCooTensor& x,
                            DenseTensor* out) {
  PD_VISIT_INTEGRAL_TYPES(
      x.non_zero_indices().dtype(), "SparseCooToDenseGPUKernel", ([&] {
        SparseCooToDenseGPUKernel<T, data_t>(dev_ctx, x, out);
      }));
}

}  // namespace sparse
}  // namespace phi

PD_REGISTER_KERNEL(dense_to_sparse_coo,
                   GPU,
                   ALL_LAYOUT,
                   phi::sparse::DenseToSparseCooKernel,
                   float,
                   double,
                   phi::dtype::float16,
                   uint8_t,
                   int8_t,
                   int16_t,
                   int,
                   int64_t) {}

PD_REGISTER_KERNEL(sparse_csr_to_coo,
                   GPU,
                   ALL_LAYOUT,
                   phi::sparse::SparseCsrToCooKernel,
                   float,
                   double,
                   phi::dtype::float16,
                   uint8_t,
                   int8_t,
                   int16_t,
                   int,
                   int64_t) {}

PD_REGISTER_KERNEL(sparse_coo_to_csr,
                   GPU,
                   ALL_LAYOUT,
                   phi::sparse::SparseCooToCsrKernel,
                   float,
                   double,
                   phi::dtype::float16,
                   uint8_t,
                   int8_t,
                   int16_t,
                   int,
                   int64_t) {}

PD_REGISTER_KERNEL(dense_to_sparse_csr,
                   GPU,
                   ALL_LAYOUT,
                   phi::sparse::DenseToSparseCsrKernel,
                   float,
                   double,
                   phi::dtype::float16,
                   uint8_t,
                   int8_t,
                   int16_t,
                   int,
                   int64_t) {}

PD_REGISTER_KERNEL(sparse_coo_to_dense,
                   GPU,
                   ALL_LAYOUT,
                   phi::sparse::SparseCooToDenseKernel,
                   float,
                   double,
                   phi::dtype::float16,
                   uint8_t,
                   int8_t,
                   int16_t,
                   int,
                   int64_t) {}

PD_REGISTER_KERNEL(sparse_csr_to_dense,
                   GPU,
                   ALL_LAYOUT,
                   phi::sparse::SparseCsrToDenseKernel,
                   float,
                   double,
                   phi::dtype::float16,
                   uint8_t,
                   int8_t,
                   int16_t,
                   int,
                   int64_t) {}

PD_REGISTER_KERNEL(coo_values,
                   GPU,
                   ALL_LAYOUT,
                   phi::sparse::CooValuesKernel,
                   float,
                   double,
                   phi::dtype::float16,
                   uint8_t,
                   int8_t,
                   int16_t,
                   int,
                   int64_t) {
  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
}

PD_REGISTER_KERNEL(csr_values,
                   GPU,
                   ALL_LAYOUT,
                   phi::sparse::CsrValuesKernel,
                   float,
                   double,
                   phi::dtype::float16,
                   uint8_t,
                   int8_t,
                   int16_t,
                   int,
                   int64_t) {
  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
}

PD_REGISTER_KERNEL(sparse_coo_tensor,
                   GPU,
                   ALL_LAYOUT,
                   phi::sparse::SparseCooTensorKernel,
                   float,
                   double,
                   phi::dtype::float16,
                   uint8_t,
                   int16_t,
                   int,
                   int64_t) {}