conv_kernel.cu 13.7 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/kernels/sparse/conv_kernel.h"

#include <cstdint>
#include <string>
#include <type_traits>
#include <vector>

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/scatter.cu.h"
#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h"
#include "paddle/phi/kernels/sparse/gpu/conv.cu.h"
#ifdef PADDLE_WITH_CUTLASS
#include "paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.h"
#endif

#include "glog/logging.h"

namespace phi {
namespace sparse {

34
/**
 * GPU implementation of sparse 3-D convolution on a COO tensor.
 *
 * Stages, in the order they run below:
 *   1. Obtain a rulebook: either from PrepareSubm (only attempted when `subm`
 *      is set and `key` is non-empty) or by calling ProductRuleBook and then
 *      SaveToTable.
 *   2. When built with PADDLE_WITH_CUTLASS and the configuration passes the
 *      checks below, run one gather-GEMM-scatter call per kernel offset.
 *   3. Otherwise: gather input rows, run one GEMM per kernel offset and
 *      scatter the partial results into the output values.
 *
 * T:    value dtype of x / kernel / out.
 * IntT: element type of the rulebook (selected from x.indices().dtype() by
 *       the caller).
 *
 * dev_ctx:   GPU context providing allocation, stream and BLAS access.
 * x:         input SparseCooTensor.
 * kernel:    dense weights; dims [0..2] are the spatial kernel sizes,
 *            dim 3 is read as in_channels and dim 4 as out_channels.
 * paddings, dilations, strides: convolution attributes; for `subm` the
 *            paddings and strides are replaced by
 *            ResetSubmKernelSizeAndStrides.
 * groups:    not read by this function (see the note on groups = 1 below).
 * subm:      true for submanifold convolution.
 * key:       rulebook key forwarded to PrepareSubm / SaveToTable.
 * out:       output SparseCooTensor; its values are zeroed and accumulated.
 * rulebook, counter: forwarded to SaveToTable.
 **/
template <typename T, typename IntT>
void Conv3dCooGPUKernel(const GPUContext& dev_ctx,
                        const SparseCooTensor& x,
                        const DenseTensor& kernel,
                        const std::vector<int>& paddings,
                        const std::vector<int>& dilations,
                        const std::vector<int>& strides,
                        const int groups,
                        const bool subm,
                        const std::string& key,
                        SparseCooTensor* out,
                        DenseTensor* rulebook,
                        DenseTensor* counter) {
  // update padding and dilation
  // Currently, only support x.layout is NDHWC, groups = 1
  // if x.layout != NDHWC then transpose(x), transpose(weight)
  const auto& x_dims = x.dims();
  const auto& kernel_dims = kernel.dims();
  int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
  DDim out_dims = {1, 1, 1, 1, 1};
  std::vector<int> kernel_sizes(kernel_dims.size());
  for (int i = 0; i < kernel_dims.size(); i++) {
    kernel_sizes[i] = kernel_dims[i];
  }

  std::vector<int> subm_paddings(paddings), subm_strides(strides);
  if (subm) {
    // the out shape of subm_conv is same as input shape
    // reset the padding=kernel_size/2 and strides=1
    phi::funcs::sparse::ResetSubmKernelSizeAndStrides(
        kernel.dims(), &subm_paddings, &subm_strides);
  }

  phi::funcs::sparse::GetOutShape(
      x_dims, kernel_sizes, subm_paddings, dilations, subm_strides, &out_dims);
  const int in_channels = kernel_dims[3];
  const int out_channels = kernel_dims[4];

  // Host-side buffers: h_counter_ptr[i] is later used as the GEMM row count
  // for kernel offset i, h_offsets_ptr[i] as its start position in the
  // rulebook.
  DenseTensor h_counter, h_offsets;
  h_counter.Resize({kernel_size});
  h_offsets.Resize({kernel_size + 1});
  int* h_counter_ptr = dev_ctx.template HostAlloc<int>(&h_counter);
  int* h_offsets_ptr = dev_ctx.template HostAlloc<int>(&h_offsets);

  // Second algorithm:
  // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf
  // 1. product rulebook
  DenseTensor counter_per_kernel = phi::Empty<int>(dev_ctx, {kernel_size});
  DenseTensor offsets_per_kernel = phi::Empty<int>(dev_ctx, {kernel_size});
  DenseTensor out_index = phi::Empty<int>(dev_ctx, {1});
  DenseTensor unique_value = phi::Empty<int>(dev_ctx, {1});

  VLOG(6) << "call SubmConv3D or Conv3D " << subm << " and the key is " << key;
  int rulebook_len = 0;
  const IntT* rulebook_ptr = nullptr;
  bool need_product_rulebook = true;
  if (subm && !key.empty()) {
    // PrepareSubm may clear need_product_rulebook, in which case the pointer
    // it returns is used as the rulebook.
    rulebook_ptr = phi::funcs::sparse::PrepareSubm<T, IntT, GPUContext>(
        dev_ctx,
        x,
        key,
        out_dims,
        out,
        h_counter.data<int>(),
        h_offsets.data<int>(),
        &rulebook_len,
        &need_product_rulebook);
  }

  // Kept at function scope on purpose: rulebook_ptr may point into this tensor
  // and is dereferenced by the compute stages below, so its storage must not
  // depend on SaveToTable retaining a reference.
  DenseTensor tmp_rulebook;
  if (need_product_rulebook) {
    rulebook_len = ProductRuleBook<T, GPUContext, IntT>(dev_ctx,
                                                        x,
                                                        kernel_sizes,
                                                        subm_paddings,
                                                        dilations,
                                                        subm_strides,
                                                        out_dims,
                                                        subm,
                                                        &tmp_rulebook,
                                                        &counter_per_kernel,
                                                        &offsets_per_kernel,
                                                        &out_index,
                                                        &unique_value,
                                                        out,
                                                        h_counter_ptr,
                                                        h_offsets_ptr);
    rulebook_ptr = tmp_rulebook.data<IntT>();

    phi::funcs::sparse::SaveToTable(
        dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter);
  }

#ifdef PADDLE_WITH_CUTLASS
  // The CUTLASS path is skipped when any of the following holds:
  //   - compute capability below 75,
  //   - float16/float with a channel count that is not a multiple of 4,
  //   - the rulebook element type is not int32_t.
  bool cutlass = true;
  if (dev_ctx.GetComputeCapability() < 75) cutlass = false;
  if (in_channels % 4 != 0 || out_channels % 4 != 0) {
    if (std::is_same<T, phi::dtype::float16>::value) cutlass = false;
    if (std::is_same<T, float>::value) cutlass = false;
  }
  if (!std::is_same<IntT, int32_t>::value) cutlass = false;
  if (cutlass) {
    auto* out_values = out->mutable_values();
    T* out_values_ptr = out_values->data<T>();
    phi::funcs::SetConstant<GPUContext, T> set_zero;
    set_zero(dev_ctx, out_values, static_cast<T>(0.0f));

    const T* kernel_ptr = kernel.data<T>();
    for (int i = 0; i < kernel_size; i++) {
      if (h_counter_ptr[i] <= 0) {
        continue;
      }

      const int M = h_counter_ptr[i];
      const int K = in_channels;
      const int N = out_channels;
      // 64-bit arithmetic so the element offset cannot overflow int.
      const T* tmp_kernel_ptr = kernel_ptr + static_cast<int64_t>(i) * K * N;
      // The first rulebook_len entries are used as gather indices, the next
      // rulebook_len entries as scatter indices.
      const IntT* gather_indices = rulebook_ptr + h_offsets_ptr[i];
      const IntT* scatter_indices =
          rulebook_ptr + rulebook_len + h_offsets_ptr[i];

      // NOTE(review): the output pointer is passed twice and both trailing
      // scalars are 1 -- presumably alpha = beta = 1 so every offset
      // accumulates into the zeroed output; confirm in gather_gemm_scatter.h.
      if constexpr (std::is_same<T, phi::dtype::float16>::value &&
                    std::is_same<IntT, int32_t>::value) {
        fp16_gather_gemm_scatter gather_gemm_scatter =
            getBestFp16Kernel(M, N, K);
        gather_gemm_scatter(
            dev_ctx,
            reinterpret_cast<const cutlass::half_t*>(x.values().data<T>()),
            reinterpret_cast<const cutlass::half_t*>(tmp_kernel_ptr),
            reinterpret_cast<cutlass::half_t*>(out_values_ptr),
            reinterpret_cast<cutlass::half_t*>(out_values_ptr),
            M,
            N,
            K,
            static_cast<const int32_t*>(gather_indices),
            static_cast<const int32_t*>(scatter_indices),
            static_cast<cutlass::half_t>(1),
            static_cast<cutlass::half_t>(1));
      } else if constexpr (std::is_same<T, float>::value &&
                           std::is_same<IntT, int32_t>::value) {
        fp32_gather_gemm_scatter gather_gemm_scatter =
            getBestFp32Kernel(M, N, K, dev_ctx.GetComputeCapability());
        gather_gemm_scatter(dev_ctx,
                            x.values().data<T>(),
                            tmp_kernel_ptr,
                            out_values_ptr,
                            out_values_ptr,
                            M,
                            N,
                            K,
                            gather_indices,
                            scatter_indices,
                            static_cast<T>(1),
                            static_cast<T>(1));
      } else if constexpr (std::is_same<T, double>::value &&
                           std::is_same<IntT, int32_t>::value) {
        fp64_gather_gemm_scatter gather_gemm_scatter =
            getBestFp64Kernel(M, N, K);
        gather_gemm_scatter(dev_ctx,
                            x.values().data<T>(),
                            tmp_kernel_ptr,
                            out_values_ptr,
                            out_values_ptr,
                            M,
                            N,
                            K,
                            gather_indices,
                            scatter_indices,
                            static_cast<T>(1),
                            static_cast<T>(1));
      }
    }
  } else {
#endif
    if (subm) {
      // For subm the scatter bookkeeping (out_index / unique_value) is rebuilt
      // here from the second half of the rulebook.
      auto config =
          phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
      unique_value.ResizeAndAllocate(
          {static_cast<int>(out->nnz() * kernel_size)});
      out_index.ResizeAndAllocate({static_cast<int>(rulebook_len)});
      int* out_index_ptr = out_index.data<int>();
      int* unique_value_ptr = unique_value.data<int>();
      phi::backends::gpu::GpuMemsetAsync(
          out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream());
      GroupIndexs<<<config.block_per_grid,
                    config.thread_per_block,
                    0,
                    dev_ctx.stream()>>>(rulebook_len,
                                        kernel_size,
                                        rulebook_ptr + rulebook_len,
                                        out_index_ptr,
                                        unique_value_ptr);
    }
    // 2. gather
    phi::DenseTensor in_features =
        phi::Empty<T>(dev_ctx, {rulebook_len, in_channels});
    phi::DenseTensor out_features =
        phi::Empty<T>(dev_ctx, {rulebook_len, out_channels});
    T* in_features_ptr = in_features.data<T>();
    T* out_features_ptr = out_features.data<T>();
    phi::funcs::SetConstant<GPUContext, T> set_zero;
    set_zero(dev_ctx, &out_features, static_cast<T>(0.0f));

    Gather<T, IntT>(dev_ctx,
                    x.values().data<T>(),
                    rulebook_ptr,
                    rulebook_len,
                    in_channels,
                    in_features_ptr);

    // 3. call gemm for every weight
    auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
    auto* out_values = out->mutable_values();
    T* out_values_ptr = out_values->data<T>();
    set_zero(dev_ctx, out_values, static_cast<T>(0.0f));

    const T* kernel_ptr = kernel.data<T>();
    for (int i = 0; i < kernel_size; i++) {
      if (h_counter_ptr[i] <= 0) {
        continue;
      }

      // call gemm: (n, in_channels) * (in_channels, out_channels)
      const int M = h_counter_ptr[i];
      const int K = in_channels;
      const int N = out_channels;
      // Element offsets are computed in 64 bits: rulebook offset * channels
      // can exceed the range of int for large inputs.
      const int64_t row_offset = static_cast<int64_t>(h_offsets_ptr[i]);
      T* tmp_in_ptr = in_features_ptr + row_offset * in_channels;
      const T* tmp_kernel_ptr = kernel_ptr + static_cast<int64_t>(i) * K * N;
      T* tmp_out_ptr = out_features_ptr + row_offset * out_channels;

      blas.GEMM(CblasNoTrans,
                CblasNoTrans,
                M,
                N,
                K,
                static_cast<T>(1),
                tmp_in_ptr,
                tmp_kernel_ptr,
                static_cast<T>(0),
                tmp_out_ptr);
    }

    // 4. scatter
    phi::funcs::sparse::ScatterV2<T>(dev_ctx,
                                     out_features_ptr,
                                     out_index.data<int>(),
                                     unique_value.data<int>(),
                                     out->nnz(),
                                     kernel_size,
                                     out_channels,
                                     1,
                                     out_values_ptr);
#ifdef PADDLE_WITH_CUTLASS
  }
#endif
}
292

293
/**
 * Entry point of the sparse conv3d kernel on GPU. Dispatches on the dtype of
 * x.indices() and forwards every argument unchanged to
 * Conv3dCooGPUKernel<T, data_t>.
 *
 * x: the input SparseCooTensor, shape is (N, D, H, W, C)
 * kernel: the weight data, shape is (D, H, W, C, OC)
 * out: the output SparseCooTensor, shape is (N, D, H, W, OC)
 * rulebook: return rulebook if key is not valid else return nullptr
 * counter: return counter if key is not valid else return nullptr
 **/
template <typename T, typename Context>
void Conv3dCooKernel(const Context& dev_ctx,
                     const SparseCooTensor& x,
                     const DenseTensor& kernel,
                     const std::vector<int>& paddings,
                     const std::vector<int>& dilations,
                     const std::vector<int>& strides,
                     const int groups,
                     const bool subm,
                     const std::string& key,
                     SparseCooTensor* out,
                     DenseTensor* rulebook,
                     DenseTensor* counter) {
  // `data_t` is not declared in this file; presumably the visit macro defines
  // it as the integral type matching the indices dtype -- confirm in
  // visit_type.h.
  PD_VISIT_BASE_INTEGRAL_TYPES(x.indices().dtype(), "Conv3dCooGPUKernel", ([&] {
                                 Conv3dCooGPUKernel<T, data_t>(dev_ctx,
                                                               x,
                                                               kernel,
                                                               paddings,
                                                               dilations,
                                                               strides,
                                                               groups,
                                                               subm,
                                                               key,
                                                               out,
                                                               rulebook,
                                                               counter);
                               }));
}
328 329 330 331

}  // namespace sparse
}  // namespace phi

Z
zhangkaihuo 已提交
332
// Registers phi::sparse::Conv3dCooKernel as the GPU `conv3d_coo` kernel for
// float, double and float16 values.
PD_REGISTER_KERNEL(conv3d_coo,
                   GPU,
                   ALL_LAYOUT,
                   phi::sparse::Conv3dCooKernel,
                   float,
                   double,
                   phi::dtype::float16) {
  // Input 0 (presumably `x`; verify against the kernel signature order) is
  // marked as using the sparse COO data layout.
  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
}