sparse_pool_kernel.cu 6.6 KB
Newer Older
Z
zhangkaihuo 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

15 16
#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"

Z
zhangkaihuo 已提交
17 18
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
19
#include "paddle/phi/core/visit_type.h"
Z
zhangkaihuo 已提交
20 21 22
#include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/sparse/convolution.h"
#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
23

Z
zhangkaihuo 已提交
24 25 26
namespace phi {
namespace sparse {

27
/**
 * Folds one slice of the rulebook into the output features with the MaxPool
 * functor, one (rulebook entry, channel) pair per loop iteration.
 *
 * in_features_ptr:  input features, read at in_i * channels + channel.
 * rulebook_ptr:     start of the input-index entries handled by this launch;
 *                   the paired output index is read rulebook_len elements
 *                   further on.
 * n:                number of rulebook entries handled by this launch.
 * rulebook_len:     element distance between an input index and its paired
 *                   output index inside the rulebook.
 * channels:         number of feature values per row.
 * out_features_ptr: output features, updated in place via MaxPool::compute
 *                   at out_i * channels + channel.
 *
 * NOTE(review): the update goes through the functor rather than an atomic;
 * this presumably relies on no two entries of one launch sharing an output
 * row - confirm against the rulebook construction.
 **/
template <typename T, typename IntT = int>
__global__ void MaxPoolCudaKernel(const T* in_features_ptr,
                                  const IntT* rulebook_ptr,
                                  const int n,
                                  const int rulebook_len,
                                  const int channels,
                                  T* out_features_ptr) {
  phi::funcs::MaxPool<T> max_pool_functor;
  // Widen before multiplying so the element count cannot wrap in 32-bit int.
  const int64_t total = static_cast<int64_t>(n) * channels;
  CUDA_KERNEL_LOOP_TYPE(i, total, int64_t) {
    const int64_t real_i = i / channels;
    const int64_t channel_i = i - real_i * channels;
    // Load the indices as 64-bit so the feature offsets below are computed
    // without overflow even when IntT is a 32-bit type.
    const int64_t in_i = static_cast<int64_t>(rulebook_ptr[real_i]);
    const int64_t out_i =
        static_cast<int64_t>(rulebook_ptr[real_i + rulebook_len]);
    max_pool_functor.compute(in_features_ptr[in_i * channels + channel_i],
                             &out_features_ptr[out_i * channels + channel_i]);
  }
}

/**
 * Sparse max pooling on the GPU for a single index type IntT.
 *
 * Steps: build the rulebook with ProductRuleBook, fill the output features
 * with -FLT_MAX, then launch MaxPoolCudaKernel once for every kernel offset
 * that has at least one rulebook entry.
 *
 * Layouts noted by the original author:
 *   x: (N, D, H, W, C)
 *   kernel: (D, H, W, C, OC)
 *   out: (N, D, H, W, OC)
 *
 * dev_ctx:      GPU context; supplies the stream and temporary tensors.
 * x:            sparse COO input; x.dims()[4] is used as the channel count.
 * kernel_sizes: pooling window; elements 0..2 are multiplied to obtain the
 *               number of kernel offsets (assumes at least 3 elements -
 *               not validated here).
 * paddings, dilations, strides: forwarded to GetOutShape / ProductRuleBook.
 * out:          sparse COO output; its non-zero elements are overwritten.
 * rulebook:     receives the rulebook produced by ProductRuleBook.
 **/
template <typename T, typename IntT = int>
void MaxPoolGPUKernel(const GPUContext& dev_ctx,
                      const SparseCooTensor& x,
                      const std::vector<int>& kernel_sizes,
                      const std::vector<int>& paddings,
                      const std::vector<int>& dilations,
                      const std::vector<int>& strides,
                      SparseCooTensor* out,
                      DenseTensor* rulebook) {
  const auto& x_dims = x.dims();
  int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
  const std::vector<int>& real_kernel_sizes =
      phi::funcs::sparse::PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]);
  DDim out_dims = {1, 1, 1, 1, 1};
  phi::funcs::sparse::GetOutShape(
      x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims);
  const int in_channels = real_kernel_sizes[3];

  std::vector<int> offsets(kernel_size + 1), counter(kernel_size);
  DenseTensorMeta counter_meta(
      DataType::INT32, {kernel_size}, DataLayout::NCHW);
  // Each meta describes two tensors: hand a copy to the first allocation and
  // move only on the last use, so no moved-from object is ever read.
  DenseTensor counter_per_kernel =
      phi::Empty(dev_ctx, DenseTensorMeta(counter_meta));
  DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
  DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW);
  DenseTensor out_index = phi::Empty(dev_ctx, DenseTensorMeta(index_meta));
  DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta));

  // 1. product rulebook
  int rulebook_len = ProductRuleBook<T, GPUContext, IntT>(dev_ctx,
                                                          x,
                                                          real_kernel_sizes,
                                                          paddings,
                                                          dilations,
                                                          strides,
                                                          out_dims,
                                                          false,
                                                          rulebook,
                                                          &counter_per_kernel,
                                                          &offsets_per_kernel,
                                                          &out_index,
                                                          &unique_value,
                                                          out,
                                                          &counter,
                                                          &offsets);

  const IntT* rulebook_ptr = rulebook->data<IntT>();

  T* out_features_ptr = out->mutable_non_zero_elements()->data<T>();
  const T* in_features_ptr = x.non_zero_elements().data<T>();
// 2. max pool
// Seed every output value with -FLT_MAX so the first compared input wins.
#ifdef PADDLE_WITH_HIP
  thrust::fill(thrust::hip::par.on(dev_ctx.stream()),
#else
  thrust::fill(thrust::cuda::par.on(dev_ctx.stream()),
#endif
               out_features_ptr,
               out_features_ptr + out->non_zero_elements().numel(),
               static_cast<T>(-FLT_MAX));
  // TODO(zhangkaihuo) Replacing multiple calls with one kernel may be faster
  for (int i = 0; i < kernel_size; i++) {
    if (counter[i] <= 0) {
      continue;
    }

    // Widen before multiplying so the launch size cannot wrap in 32-bit int.
    const int64_t launch_numel =
        static_cast<int64_t>(counter[i]) * in_channels;
    auto config =
        phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, launch_numel, 1);
    // The pointer is advanced by rulebook_len plus this offset's start; the
    // kernel reads input indices there and output indices rulebook_len later.
    MaxPoolCudaKernel<T, IntT><<<config.block_per_grid.x,
                                 config.thread_per_block.x,
                                 0,
                                 dev_ctx.stream()>>>(
        in_features_ptr,
        rulebook_ptr + offsets[i] + rulebook_len,
        counter[i],
        rulebook_len,
        in_channels,
        out_features_ptr);
  }
}

129 130 131 132 133 134 135 136 137
/**
 * Public sparse max-pool kernel entry point.
 *
 * Reads the dtype of x's non-zero indices, lets PD_VISIT_INTEGRAL_TYPES pick
 * the matching integral type (exposed to the lambda as data_t by the macro),
 * and forwards every argument unchanged to MaxPoolGPUKernel<T, data_t>.
 **/
template <typename T, typename Context>
void MaxPoolKernel(const Context& dev_ctx,
                   const SparseCooTensor& x,
                   const std::vector<int>& kernel_sizes,
                   const std::vector<int>& paddings,
                   const std::vector<int>& dilations,
                   const std::vector<int>& strides,
                   SparseCooTensor* out,
                   DenseTensor* rulebook) {
  const auto indices_dtype = x.non_zero_indices().dtype();
  PD_VISIT_INTEGRAL_TYPES(indices_dtype, "MaxPoolGPUKernel", ([&] {
                            MaxPoolGPUKernel<T, data_t>(dev_ctx,
                                                        x,
                                                        kernel_sizes,
                                                        paddings,
                                                        dilations,
                                                        strides,
                                                        out,
                                                        rulebook);
                          }));
}

Z
zhangkaihuo 已提交
151 152 153 154 155 156 157 158 159 160 161 162
}  // namespace sparse
}  // namespace phi

// Registers phi::sparse::MaxPoolKernel as the GPU kernel named
// "sparse_maxpool" for float, double and float16 element types, for all
// layouts.
PD_REGISTER_KERNEL(sparse_maxpool,
                   GPU,
                   ALL_LAYOUT,
                   phi::sparse::MaxPoolKernel,
                   float,
                   double,
                   phi::dtype::float16) {
  // Input 0 (x) is declared to use the sparse COO data layout.
  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
}