conv_kernel.cu 9.0 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

Z
zhangkaihuo 已提交
15
#include "paddle/phi/kernels/sparse/conv_kernel.h"
16

17 18 19
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
20
#include "paddle/phi/core/visit_type.h"
21
#include "paddle/phi/kernels/funcs/blas/blas.h"
22
#include "paddle/phi/kernels/funcs/scatter.cu.h"
23
#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h"
24 25 26
#include "paddle/phi/kernels/sparse/gpu/conv.cu.h"

#include "glog/logging.h"
27 28 29 30

namespace phi {
namespace sparse {

31
/**
 * GPU implementation of 3-D convolution on a SparseCooTensor, instantiated per
 * value type T and index type IntT.
 *
 * Stages, in the order they appear below:
 *   1. obtain the rulebook, either from PrepareSubm (only attempted when `subm`
 *      is set and `key` is non-empty) or by building it with ProductRuleBook
 *      and handing it to SaveToTable;
 *   2. gather the input features addressed by the rulebook;
 *   3. run one GEMM per kernel offset that has a positive counter;
 *   4. scatter the GEMM results into the values of `out`.
 *
 * dev_ctx:   GPU context used for allocation, BLAS and the helper launches
 * x:         input sparse tensor
 * kernel:    weights; dims [0..2] are read as the spatial kernel sizes, dim 3
 *            as in_channels and dim 4 as out_channels
 * paddings, dilations, strides: convolution configuration; for subm the
 *            paddings/strides are first passed through
 *            ResetSubmKernelSizeAndStrides
 * groups:    not read by this function (only groups = 1 is supported)
 * subm:      selects submanifold convolution
 * key:       forwarded to PrepareSubm / SaveToTable
 * out:       output sparse tensor; its values are zeroed, then filled
 * rulebook, counter: forwarded to SaveToTable
 */
template <typename T, typename IntT>
void Conv3dCooGPUKernel(const GPUContext& dev_ctx,
                        const SparseCooTensor& x,
                        const DenseTensor& kernel,
                        const std::vector<int>& paddings,
                        const std::vector<int>& dilations,
                        const std::vector<int>& strides,
                        const int groups,
                        const bool subm,
                        const std::string& key,
                        SparseCooTensor* out,
                        DenseTensor* rulebook,
                        DenseTensor* counter) {
  // update padding and dilation
  // Currently, only support x.layout is NDHWC, groups = 1
  // if x.layout != NDHWC then transpose(x), transpose(weight)
  const auto& x_dims = x.dims();
  const auto& kernel_dims = kernel.dims();
  // Number of spatial kernel offsets (product of the first three kernel dims).
  const int kernel_size =
      static_cast<int>(kernel_dims[0] * kernel_dims[1] * kernel_dims[2]);
  DDim out_dims = {1, 1, 1, 1, 1};
  std::vector<int> kernel_sizes(kernel_dims.size());
  for (int i = 0; i < kernel_dims.size(); i++) {
    kernel_sizes[i] = static_cast<int>(kernel_dims[i]);
  }

  std::vector<int> subm_paddings(paddings), subm_strides(strides);
  if (subm) {
    // the out shape of subm_conv is same as input shape
    // reset the padding=kernel_size/2 and strides=1
    phi::funcs::sparse::ResetSubmKernelSizeAndStrides(
        kernel.dims(), &subm_paddings, &subm_strides);
  }

  phi::funcs::sparse::GetOutShape(
      x_dims, kernel_sizes, subm_paddings, dilations, subm_strides, &out_dims);
  const int in_channels = static_cast<int>(kernel_dims[3]);
  const int out_channels = static_cast<int>(kernel_dims[4]);

  // Host-side buffers with kernel_size + 1 ints each. They are read on the
  // host below to drive the per-offset GEMM loop and the scatter call.
  DenseTensor h_counter, h_offsets;
  h_counter.Resize({kernel_size + 1});
  h_offsets.Resize({kernel_size + 1});
  int* h_counter_ptr = dev_ctx.template HostAlloc<int>(&h_counter);
  int* h_offsets_ptr = dev_ctx.template HostAlloc<int>(&h_offsets);

  // Second algorithm:
  // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf
  // 1. product rulebook
  DenseTensor counter_per_kernel = phi::Empty<int>(dev_ctx, {kernel_size + 1});
  DenseTensor offsets_per_kernel = phi::Empty<int>(dev_ctx, {kernel_size});
  DenseTensor out_index = phi::Empty<int>(dev_ctx, {1});
  DenseTensor unique_value = phi::Empty<int>(dev_ctx, {1});

  VLOG(6) << "call SubmConv3D or Conv3D " << subm << " and the key is " << key;
  int rulebook_len = 0;
  const IntT* rulebook_ptr = nullptr;
  bool need_product_rulebook = true;
  if (subm && !key.empty()) {
    // PrepareSubm may clear need_product_rulebook and fill the host buffers
    // (presumably when a rulebook is already stored under `key` - confirm in
    // PrepareSubm).
    rulebook_ptr = phi::funcs::sparse::PrepareSubm<T, IntT, GPUContext>(
        dev_ctx,
        x,
        key,
        out_dims,
        out,
        h_counter.data<int>(),
        h_offsets.data<int>(),
        &rulebook_len,
        &need_product_rulebook);
  }

  // Declared at function scope on purpose: rulebook_ptr points into this
  // tensor and is read by Gather below, so the tensor must outlive the
  // branch that fills it regardless of whether SaveToTable keeps a reference.
  DenseTensor tmp_rulebook;
  if (need_product_rulebook) {
    rulebook_len = ProductRuleBook<T, GPUContext, IntT>(dev_ctx,
                                                        x,
                                                        kernel_sizes,
                                                        subm_paddings,
                                                        dilations,
                                                        subm_strides,
                                                        out_dims,
                                                        subm,
                                                        &tmp_rulebook,
                                                        &counter_per_kernel,
                                                        &offsets_per_kernel,
                                                        &out_index,
                                                        &unique_value,
                                                        out,
                                                        h_counter_ptr,
                                                        h_offsets_ptr);
    rulebook_ptr = tmp_rulebook.data<IntT>();

    phi::funcs::sparse::SaveToTable(
        dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter);
  }
  // NOTE(review): out_index and unique_value are only handed to
  // ProductRuleBook. When PrepareSubm supplied the rulebook they still hold
  // their initial 1-element allocation when passed to ScatterV2 below -
  // confirm that path is valid.

  // 2. gather
  phi::DenseTensor in_features =
      phi::Empty<T>(dev_ctx, {rulebook_len, in_channels});
  phi::DenseTensor out_features =
      phi::Empty<T>(dev_ctx, {rulebook_len, out_channels});
  T* in_features_ptr = in_features.data<T>();
  T* out_features_ptr = out_features.data<T>();
  phi::funcs::SetConstant<GPUContext, T> set_zero;
  // Rows of out_features belonging to skipped kernel offsets are never
  // written by a GEMM, so the whole buffer is zeroed up front.
  set_zero(dev_ctx, &out_features, static_cast<T>(0.0f));

  Gather<T, IntT>(dev_ctx,
                  x.non_zero_elements().data<T>(),
                  rulebook_ptr,
                  rulebook_len,
                  in_channels,
                  in_features_ptr);

  // 3. call gemm for every weight
  auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
  auto* out_values = out->mutable_non_zero_elements();
  T* out_values_ptr = out_values->data<T>();
  set_zero(dev_ctx, out_values, static_cast<T>(0.0f));

  const T* kernel_ptr = kernel.data<T>();
  for (int i = 0; i < kernel_size; i++) {
    if (h_counter_ptr[i] <= 0) {
      continue;
    }

    // call gemm: (n, in_channels) * (in_channels, out_channels)
    const int M = h_counter_ptr[i];
    const int K = in_channels;
    const int N = out_channels;
    // h_offsets_ptr[i] is the first row of this offset's slice in the
    // gathered / produced feature buffers.
    T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels;
    const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
    T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels;

    blas.GEMM(CblasNoTrans,
              CblasNoTrans,
              M,
              N,
              K,
              static_cast<T>(1),
              tmp_in_ptr,
              tmp_kernel_ptr,
              static_cast<T>(0),
              tmp_out_ptr);
  }

  // 4. scatter
  phi::funcs::sparse::ScatterV2<T>(dev_ctx,
                                   out_features_ptr,
                                   out_index.data<int>(),
                                   unique_value.data<int>(),
                                   out->nnz(),
                                   kernel_size,
                                   h_counter_ptr[kernel_size],
                                   out_channels,
                                   1,
                                   out_values_ptr);
}
184

185
/**
 * x: the input SparseCooTensor, shape is (N, D, H, W, C)
 * kernel: the weight data, shape is (D, H, W, C, OC)
 * out: the output SparseCooTensor, shape is (N, D, H, W, OC)
 * rulebook: return rulebook if key is not valid else return nullptr
 * counter: return counter if key is not valid else return nullptr
 **/
192
template <typename T, typename Context>
void Conv3dCooKernel(const Context& dev_ctx,
                     const SparseCooTensor& x,
                     const DenseTensor& kernel,
                     const std::vector<int>& paddings,
                     const std::vector<int>& dilations,
                     const std::vector<int>& strides,
                     const int groups,
                     const bool subm,
                     const std::string& key,
                     SparseCooTensor* out,
                     DenseTensor* rulebook,
                     DenseTensor* counter) {
  // Entry point registered for conv3d_coo: it only selects the index type.
  // The dispatch is on the dtype of x's index tensor; inside the lambda the
  // chosen integral type is referred to as `data_t` (supplied by the macro),
  // and every argument is forwarded unchanged to Conv3dCooGPUKernel.
  PD_VISIT_BASE_INTEGRAL_TYPES(
      x.non_zero_indices().dtype(), "Conv3dCooGPUKernel", ([&] {
        Conv3dCooGPUKernel<T, data_t>(dev_ctx,
                                      x,
                                      kernel,
                                      paddings,
                                      dilations,
                                      strides,
                                      groups,
                                      subm,
                                      key,
                                      out,
                                      rulebook,
                                      counter);
      }));
}
221 222 223 224

}  // namespace sparse
}  // namespace phi

Z
zhangkaihuo 已提交
225
// Registers phi::sparse::Conv3dCooKernel under the name conv3d_coo for the GPU
// backend with ALL_LAYOUT, instantiated for float, double and float16 values.
PD_REGISTER_KERNEL(conv3d_coo,
                   GPU,
                   ALL_LAYOUT,
                   phi::sparse::Conv3dCooKernel,
                   float,
                   double,
                   phi::dtype::float16) {
  // Declare that input 0 uses the SPARSE_COO data layout.
  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
}