/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/kernels/sparse/conv_kernel.h"

#include <cstdint>

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/scatter.cu.h"
#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h"
#include "paddle/phi/kernels/sparse/gpu/conv.cu.h"

#include "glog/logging.h"

namespace phi {
namespace sparse {

/**
 * GPU implementation of the sparse COO 3-D convolution, templated on the
 * value type T and the index type IntT.
 *
 * The work is done in four steps (see the numbered comments in the body):
 *   1. build the rulebook, or fetch a cached one when subm is set and key is
 *      non-empty;
 *   2. gather the input features addressed by the rulebook;
 *   3. run one GEMM per kernel offset that has at least one rulebook entry;
 *   4. scatter the per-entry results into the output values.
 *
 * x: the input SparseCooTensor
 * kernel: the weight data; dims [0..2] are the spatial kernel sizes, dim 3 is
 *         read as in_channels and dim 4 as out_channels
 * paddings/dilations/strides: convolution configuration; for subm the
 *         paddings and strides are reset by ResetSubmKernelSizeAndStrides
 * groups: not read by this function (only groups = 1 is supported)
 * subm: whether this is a submanifold convolution
 * key: when non-empty (and subm is set) it is passed to PrepareSubm to look
 *      up a previously saved rulebook
 * out: the output SparseCooTensor
 * rulebook/counter: forwarded to SaveToTable when a new rulebook is produced
 **/
template <typename T, typename IntT>
void Conv3dCooGPUKernel(const GPUContext& dev_ctx,
                        const SparseCooTensor& x,
                        const DenseTensor& kernel,
                        const std::vector<int>& paddings,
                        const std::vector<int>& dilations,
                        const std::vector<int>& strides,
                        const int groups,
                        const bool subm,
                        const std::string& key,
                        SparseCooTensor* out,
                        DenseTensor* rulebook,
                        DenseTensor* counter) {
  // update padding and dilation
  // Currently, only support x.layout is NDHWC, groups = 1
  // if x.layout != NDHWC then transpose(x), transpose(weight)
  const auto& x_dims = x.dims();
  const auto& kernel_dims = kernel.dims();
  int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
  DDim out_dims = {1, 1, 1, 1, 1};
  std::vector<int> kernel_sizes(kernel_dims.size());
  for (int i = 0; i < kernel_dims.size(); i++) {
    kernel_sizes[i] = kernel_dims[i];
  }

  std::vector<int> subm_paddings(paddings), subm_strides(strides);
  if (subm) {
    // the out shape of subm_conv is same as input shape
    // reset the padding=kernel_size/2 and strides=1
    phi::funcs::sparse::ResetSubmKernelSizeAndStrides(
        kernel.dims(), &subm_paddings, &subm_strides);
  }

  phi::funcs::sparse::GetOutShape(
      x_dims, kernel_sizes, subm_paddings, dilations, subm_strides, &out_dims);
  const int in_channels = kernel_dims[3];
  const int out_channels = kernel_dims[4];

  // Host-side copies of the per-kernel-offset entry counts and offsets; they
  // drive the GEMM loop below.
  DenseTensor h_counter, h_offsets;
  h_counter.Resize({kernel_size});
  h_offsets.Resize({kernel_size + 1});
  int* h_counter_ptr = dev_ctx.template HostAlloc<int>(&h_counter);
  int* h_offsets_ptr = dev_ctx.template HostAlloc<int>(&h_offsets);

  // Second algorithm:
  // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf
  // 1. product rulebook
  DenseTensor counter_per_kernel = phi::Empty<int>(dev_ctx, {kernel_size});
  DenseTensor offsets_per_kernel = phi::Empty<int>(dev_ctx, {kernel_size});
  DenseTensor out_index = phi::Empty<int>(dev_ctx, {1});
  DenseTensor unique_value = phi::Empty<int>(dev_ctx, {1});

  VLOG(6) << "call SubmConv3D or Conv3D " << subm << " and the key is " << key;
  int rulebook_len = 0;
  const IntT* rulebook_ptr = nullptr;
  bool need_product_rulebook = true;
  if (subm && !key.empty()) {
    rulebook_ptr = phi::funcs::sparse::PrepareSubm<T, IntT, GPUContext>(
        dev_ctx,
        x,
        key,
        out_dims,
        out,
        h_counter.data<int>(),
        h_offsets.data<int>(),
        &rulebook_len,
        &need_product_rulebook);
  }

  // Declared at function scope so that the buffer behind rulebook_ptr is
  // guaranteed to stay alive for the gather / GroupIndexs calls below,
  // regardless of whether SaveToTable keeps its own reference to it.
  DenseTensor tmp_rulebook;
  if (need_product_rulebook) {
    rulebook_len = ProductRuleBook<T, GPUContext, IntT>(dev_ctx,
                                                        x,
                                                        kernel_sizes,
                                                        subm_paddings,
                                                        dilations,
                                                        subm_strides,
                                                        out_dims,
                                                        subm,
                                                        &tmp_rulebook,
                                                        &counter_per_kernel,
                                                        &offsets_per_kernel,
                                                        &out_index,
                                                        &unique_value,
                                                        out,
                                                        h_counter_ptr,
                                                        h_offsets_ptr);
    rulebook_ptr = tmp_rulebook.data<IntT>();

    phi::funcs::sparse::SaveToTable(
        dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter);
  }

  // 2. gather
  phi::DenseTensor in_features =
      phi::Empty<T>(dev_ctx, {rulebook_len, in_channels});
  phi::DenseTensor out_features =
      phi::Empty<T>(dev_ctx, {rulebook_len, out_channels});
  T* in_features_ptr = in_features.data<T>();
  T* out_features_ptr = out_features.data<T>();
  phi::funcs::SetConstant<GPUContext, T> set_zero;
  set_zero(dev_ctx, &out_features, static_cast<T>(0.0f));

  Gather<T, IntT>(dev_ctx,
                  x.non_zero_elements().data<T>(),
                  rulebook_ptr,
                  rulebook_len,
                  in_channels,
                  in_features_ptr);

  // 3. call gemm for every weight
  auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
  auto* out_values = out->mutable_non_zero_elements();
  T* out_values_ptr = out_values->data<T>();
  set_zero(dev_ctx, out_values, static_cast<T>(0.0f));

  if (subm) {
    // For subm the index buffers consumed by the scatter step are rebuilt
    // here from the rulebook, so this also covers the cached-rulebook path
    // where ProductRuleBook was not called.
    auto config =
        phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
    unique_value.ResizeAndAllocate(
        {static_cast<int>(out->nnz() * kernel_size)});
    out_index.ResizeAndAllocate({static_cast<int>(rulebook_len)});
    int* out_index_ptr = out_index.data<int>();
    int* unique_value_ptr = unique_value.data<int>();
    phi::backends::gpu::GpuMemsetAsync(
        out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream());
    // NOTE(review): rulebook_ptr + rulebook_len skips the first rulebook_len
    // entries; presumably this is the output-index row - confirm against the
    // layout written by ProductRuleBook.
    GroupIndexs<<<config.block_per_grid,
                  config.thread_per_block,
                  0,
                  dev_ctx.stream()>>>(rulebook_len,
                                      kernel_size,
                                      rulebook_ptr + rulebook_len,
                                      out_index_ptr,
                                      unique_value_ptr);
  }

  const T* kernel_ptr = kernel.data<T>();
  for (int i = 0; i < kernel_size; i++) {
    if (h_counter_ptr[i] <= 0) {
      continue;
    }

    // call gemm: (n, in_channels) * (in_channels, out_channels)
    const int M = h_counter_ptr[i];
    const int K = in_channels;
    const int N = out_channels;
    // Element offsets are computed in 64 bits: offset * channels can exceed
    // INT_MAX for large rulebooks, and int overflow would yield a bad pointer.
    const int64_t row_offset = static_cast<int64_t>(h_offsets_ptr[i]);
    T* tmp_in_ptr = in_features_ptr + row_offset * in_channels;
    const T* tmp_kernel_ptr = kernel_ptr + static_cast<int64_t>(i) * K * N;
    T* tmp_out_ptr = out_features_ptr + row_offset * out_channels;

    blas.GEMM(CblasNoTrans,
              CblasNoTrans,
              M,
              N,
              K,
              static_cast<T>(1),
              tmp_in_ptr,
              tmp_kernel_ptr,
              static_cast<T>(0),
              tmp_out_ptr);
  }

  // 4. scatter
  phi::funcs::sparse::ScatterV2<T>(dev_ctx,
                                   out_features_ptr,
                                   out_index.data<int>(),
                                   unique_value.data<int>(),
                                   out->nnz(),
                                   kernel_size,
                                   out_channels,
                                   1,
                                   out_values_ptr);
}

/**
 * Entry point of the sparse COO 3-D convolution. It dispatches on the dtype
 * of x's indices through PD_VISIT_INTEGRAL_TYPES and forwards every argument
 * unchanged to Conv3dCooGPUKernel<T, data_t>.
 *
 * x: the input SparseCooTensor, shape is (N, D, H, W, C)
 * kernel: the weight data, shape is (D, H, W, C, OC)
 * out: the output SparseCooTensor, shape is (N, D, H, W, OC)
 * rulebook: return rulebook if key is not valid else return nullptr
 * counter: return counter if key is not valid else return nullptr
 **/
template <typename T, typename Context>
void Conv3dCooKernel(const Context& dev_ctx,
                     const SparseCooTensor& x,
                     const DenseTensor& kernel,
                     const std::vector<int>& paddings,
                     const std::vector<int>& dilations,
                     const std::vector<int>& strides,
                     const int groups,
                     const bool subm,
                     const std::string& key,
                     SparseCooTensor* out,
                     DenseTensor* rulebook,
                     DenseTensor* counter) {
  PD_VISIT_INTEGRAL_TYPES(
      x.non_zero_indices().dtype(), "Conv3dCooGPUKernel", ([&] {
        Conv3dCooGPUKernel<T, data_t>(dev_ctx,
                                      x,
                                      kernel,
                                      paddings,
                                      dilations,
                                      strides,
                                      groups,
                                      subm,
                                      key,
                                      out,
                                      rulebook,
                                      counter);
      }));
}

}  // namespace sparse
}  // namespace phi

// Registers phi::sparse::Conv3dCooKernel as the GPU "conv3d_coo" kernel for
// float, double and float16, and marks input 0 as using the SPARSE_COO layout.
PD_REGISTER_KERNEL(conv3d_coo,
                   GPU,
                   ALL_LAYOUT,
                   phi::sparse::Conv3dCooKernel,
                   float,
                   double,
                   phi::dtype::float16) {
  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
}