conv_kernel.cu 11.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

Z
zhangkaihuo 已提交
15
#include "paddle/phi/kernels/sparse/conv_kernel.h"
16

17 18 19
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
20
#include "paddle/phi/core/visit_type.h"
21
#include "paddle/phi/kernels/funcs/blas/blas.h"
22
#include "paddle/phi/kernels/funcs/scatter.cu.h"
23
#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h"
24
#include "paddle/phi/kernels/sparse/gpu/conv.cu.h"
25 26 27
#ifdef PADDLE_WITH_CUTLASS
#include "paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.h"
#endif
28 29

#include "glog/logging.h"
30 31 32 33

namespace phi {
namespace sparse {

34
/**
 * GPU implementation of 3-D convolution on a SparseCooTensor, templated on the
 * value type T and the index type IntT.
 *
 * dev_ctx:   GPU context used for allocation, BLAS and the launch stream
 * x:         the input SparseCooTensor
 * kernel:    dense weight; dims [0..2] are multiplied to get the number of
 *            kernel offsets, dim 3 is read as in_channels and dim 4 as
 *            out_channels
 * paddings / dilations / strides: convolution geometry; when subm is true the
 *            paddings and strides are first passed through
 *            ResetSubmKernelSizeAndStrides
 * groups:    not read by this function (only groups = 1 is supported)
 * subm:      selects submanifold convolution
 * key:       when subm is true and key is non-empty, PrepareSubm is asked for
 *            a previously stored rulebook before a new one is produced
 * out:       the output SparseCooTensor; its values are zeroed and then
 *            accumulated into
 * rulebook / counter: forwarded to SaveToTable when a new rulebook is produced
 *
 * As used below, the rulebook holds `rulebook_len` gather (input) indices
 * followed by `rulebook_len` scatter (output) indices; h_counter / h_offsets
 * are host-side per-kernel-offset counts and start offsets into it.
 **/
template <typename T, typename IntT>
void Conv3dCooGPUKernel(const GPUContext& dev_ctx,
                        const SparseCooTensor& x,
                        const DenseTensor& kernel,
                        const std::vector<int>& paddings,
                        const std::vector<int>& dilations,
                        const std::vector<int>& strides,
                        const int groups,
                        const bool subm,
                        const std::string& key,
                        SparseCooTensor* out,
                        DenseTensor* rulebook,
                        DenseTensor* counter) {
  // update padding and dilation
  // Currently, only support x.layout is NDHWC, groups = 1
  // if x.layout != NDHWC then transpose(x), transpose(weight)
  const auto& x_dims = x.dims();
  const auto& kernel_dims = kernel.dims();
  int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
  DDim out_dims = {1, 1, 1, 1, 1};
  std::vector<int> kernel_sizes(kernel_dims.size());
  for (int i = 0; i < kernel_dims.size(); i++) {
    kernel_sizes[i] = kernel_dims[i];
  }

  std::vector<int> subm_paddings(paddings), subm_strides(strides);
  if (subm) {
    // the out shape of subm_conv is same as input shape
    // reset the padding=kernel_size/2 and strides=1
    phi::funcs::sparse::ResetSubmKernelSizeAndStrides(
        kernel.dims(), &subm_paddings, &subm_strides);
  }

  phi::funcs::sparse::GetOutShape(
      x_dims, kernel_sizes, subm_paddings, dilations, subm_strides, &out_dims);
  const int in_channels = kernel_dims[3];
  const int out_channels = kernel_dims[4];

  // Host-side per-kernel-offset counters and start offsets; they drive the
  // per-offset GEMM loops below.
  DenseTensor h_counter, h_offsets;
  h_counter.Resize({kernel_size});
  h_offsets.Resize({kernel_size + 1});
  int* h_counter_ptr = dev_ctx.template HostAlloc<int>(&h_counter);
  int* h_offsets_ptr = dev_ctx.template HostAlloc<int>(&h_offsets);

  // Second algorithm:
  // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf
  // 1. product rulebook
  DenseTensor counter_per_kernel = phi::Empty<int>(dev_ctx, {kernel_size});
  DenseTensor offsets_per_kernel = phi::Empty<int>(dev_ctx, {kernel_size});
  DenseTensor out_index = phi::Empty<int>(dev_ctx, {1});
  DenseTensor unique_value = phi::Empty<int>(dev_ctx, {1});

  VLOG(6) << "call SubmConv3D or Conv3D " << subm << " and the key is " << key;
  int rulebook_len = 0;
  const IntT* rulebook_ptr = nullptr;
  bool need_product_rulebook = true;
  if (subm && !key.empty()) {
    rulebook_ptr = phi::funcs::sparse::PrepareSubm<T, IntT, GPUContext>(
        dev_ctx,
        x,
        key,
        out_dims,
        out,
        h_counter.data<int>(),
        h_offsets.data<int>(),
        &rulebook_len,
        &need_product_rulebook);
  }

  // Declared at function scope on purpose: rulebook_ptr may point into this
  // tensor and is dereferenced by the gather/GEMM/scatter steps below, so its
  // storage must not depend on SaveToTable retaining a reference.
  DenseTensor tmp_rulebook;
  if (need_product_rulebook) {
    rulebook_len = ProductRuleBook<T, GPUContext, IntT>(dev_ctx,
                                                        x,
                                                        kernel_sizes,
                                                        subm_paddings,
                                                        dilations,
                                                        subm_strides,
                                                        out_dims,
                                                        subm,
                                                        &tmp_rulebook,
                                                        &counter_per_kernel,
                                                        &offsets_per_kernel,
                                                        &out_index,
                                                        &unique_value,
                                                        out,
                                                        h_counter_ptr,
                                                        h_offsets_ptr);
    rulebook_ptr = tmp_rulebook.data<IntT>();

    phi::funcs::sparse::SaveToTable(
        dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter);
  }

#ifdef PADDLE_WITH_CUTLASS
  // The fused gather-GEMM-scatter path is only taken when every condition
  // below holds; otherwise fall through to the generic path.
  bool cutlass = true;
  if (dev_ctx.GetComputeCapability() < 75) cutlass = false;
  if (in_channels % 4 != 0 || out_channels % 4 != 0) {
    if (std::is_same<T, phi::dtype::float16>::value) cutlass = false;
    if (std::is_same<T, float>::value) cutlass = false;
  }
  if (!std::is_same<IntT, int32_t>::value) cutlass = false;
  if (cutlass) {
    auto* out_values = out->mutable_values();
    T* out_values_ptr = out_values->data<T>();
    phi::funcs::SetConstant<GPUContext, T> set_zero;
    set_zero(dev_ctx, out_values, static_cast<T>(0.0f));

    const T* kernel_ptr = kernel.data<T>();
    for (int i = 0; i < kernel_size; i++) {
      if (h_counter_ptr[i] <= 0) {
        continue;
      }

      const int M = h_counter_ptr[i];
      const int K = in_channels;
      const int N = out_channels;
      const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
      // First half of the rulebook: input rows to gather; second half
      // (starting at rulebook_len): output rows to scatter into.
      const IntT* gather_indices = rulebook_ptr + h_offsets_ptr[i];
      const IntT* scatter_indices =
          rulebook_ptr + rulebook_len + h_offsets_ptr[i];
      // out_values_ptr is passed twice, exactly as in the original call.
      dispatchKernel(dev_ctx,
                     x.values().data<T>(),
                     tmp_kernel_ptr,
                     out_values_ptr,
                     out_values_ptr,
                     M,
                     N,
                     K,
                     gather_indices,
                     scatter_indices,
                     cutlass,
                     x.dtype());
    }
  } else {
#endif
    if (subm) {
      // For submanifold conv, build out_index / unique_value here from the
      // scatter half of the rulebook.
      auto config =
          phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
      unique_value.ResizeAndAllocate(
          {static_cast<int>(out->nnz() * kernel_size)});
      out_index.ResizeAndAllocate({static_cast<int>(rulebook_len)});
      int* out_index_ptr = out_index.data<int>();
      int* unique_value_ptr = unique_value.data<int>();
      phi::backends::gpu::GpuMemsetAsync(
          out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream());
      GroupIndexs<<<config.block_per_grid,
                    config.thread_per_block,
                    0,
                    dev_ctx.stream()>>>(rulebook_len,
                                        kernel_size,
                                        rulebook_ptr + rulebook_len,
                                        out_index_ptr,
                                        unique_value_ptr);
    }
    // 2. gather
    phi::DenseTensor in_features =
        phi::Empty<T>(dev_ctx, {rulebook_len, in_channels});
    phi::DenseTensor out_features =
        phi::Empty<T>(dev_ctx, {rulebook_len, out_channels});
    T* in_features_ptr = in_features.data<T>();
    T* out_features_ptr = out_features.data<T>();
    phi::funcs::SetConstant<GPUContext, T> set_zero;
    set_zero(dev_ctx, &out_features, static_cast<T>(0.0f));

    Gather<T, IntT>(dev_ctx,
                    x.values().data<T>(),
                    rulebook_ptr,
                    rulebook_len,
                    in_channels,
                    in_features_ptr);

    // 3. call gemm for every weight
    auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
    auto* out_values = out->mutable_values();
    T* out_values_ptr = out_values->data<T>();
    set_zero(dev_ctx, out_values, static_cast<T>(0.0f));

    const T* kernel_ptr = kernel.data<T>();
    for (int i = 0; i < kernel_size; i++) {
      if (h_counter_ptr[i] <= 0) {
        continue;
      }

      // call gemm: (n, in_channels) * (in_channels, out_channels)
      const int M = h_counter_ptr[i];
      const int K = in_channels;
      const int N = out_channels;
      // Widen before multiplying: row offset * channel count can exceed the
      // range of int for large rulebooks.
      const int64_t row_offset = h_offsets_ptr[i];
      T* tmp_in_ptr = in_features_ptr + row_offset * in_channels;
      const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
      T* tmp_out_ptr = out_features_ptr + row_offset * out_channels;

      blas.GEMM(CblasNoTrans,
                CblasNoTrans,
                M,
                N,
                K,
                static_cast<T>(1),
                tmp_in_ptr,
                tmp_kernel_ptr,
                static_cast<T>(0),
                tmp_out_ptr);
    }

    // 4. scatter
    phi::funcs::sparse::ScatterV2<T>(dev_ctx,
                                     out_features_ptr,
                                     out_index.data<int>(),
                                     unique_value.data<int>(),
                                     out->nnz(),
                                     kernel_size,
                                     out_channels,
                                     1,
                                     out_values_ptr);
#ifdef PADDLE_WITH_CUTLASS
  }
#endif
}
250

251
/**
 * x: the input SparseCooTensor, shape is (N, D, H, W, C)
 * kernel: the weight data, shape is (D, H, W, C, OC)
 * out: the output SparseCooTensor, shape is (N, D, H, W, OC)
 * rulebook: return rulebook if key is not valid else return nullptr
 * counter: return counter if key is not valid else return nullptr
 *
 * Dispatches on the dtype of x's indices and forwards every argument
 * unchanged to Conv3dCooGPUKernel.
 **/
template <typename T, typename Context>
void Conv3dCooKernel(const Context& dev_ctx,
                     const SparseCooTensor& x,
                     const DenseTensor& kernel,
                     const std::vector<int>& paddings,
                     const std::vector<int>& dilations,
                     const std::vector<int>& strides,
                     const int groups,
                     const bool subm,
                     const std::string& key,
                     SparseCooTensor* out,
                     DenseTensor* rulebook,
                     DenseTensor* counter) {
  // Inside the lambda, data_t is the integral index type selected by the
  // visit macro (it is not declared anywhere else in this function).
  PD_VISIT_BASE_INTEGRAL_TYPES(x.indices().dtype(), "Conv3dCooGPUKernel", ([&] {
                                 Conv3dCooGPUKernel<T, data_t>(dev_ctx,
                                                               x,
                                                               kernel,
                                                               paddings,
                                                               dilations,
                                                               strides,
                                                               groups,
                                                               subm,
                                                               key,
                                                               out,
                                                               rulebook,
                                                               counter);
                               }));
}
286 287 288 289

}  // namespace sparse
}  // namespace phi

Z
zhangkaihuo 已提交
290
// Registers phi::sparse::Conv3dCooKernel as the GPU `conv3d_coo` kernel for
// float, double and float16, and marks input 0 as SPARSE_COO layout.
PD_REGISTER_KERNEL(conv3d_coo,
                   GPU,
                   ALL_LAYOUT,
                   phi::sparse::Conv3dCooKernel,
                   float,
                   double,
                   phi::dtype::float16) {
  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
}