Unverified commit babd26ee, authored by wenbin, committed by GitHub

groupnorm nhwc8 (#49160)

* gn nhwc8

* remove error
Parent 6439e91d
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
* SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdint.h>
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
struct GroupNormNHWCParams {
// The output buffer. Layout NHWC.
__half* dst;
// The input buffer. Layout NHWC.
__half const* srcX;
// The input buffer. Layout NHWC.
__half const* srcY;
// The gamma scaling factor.
void const* gamma;
// The beta term to add in GN.
void const* beta;
// The temporary buffer to do the global parallel reduction. Size:
// BLOCKS_PER_BATCH x C x 2.
float* redBuffer;
// The number of instances in the batch.
int32_t n;
// The height and width of each activation map.
int32_t h, w;
// The number of channels.
int32_t c;
// The number of groups.
int32_t groups;
// Do we apply the Swish activation function?
bool withSwish;
// Precomputed values and parameters to control the execution of the kernels.
// The number of activations per instance (h * w) and the number of
// activations per block.
int32_t hw, hwPerBlock;
// The number of channels per group and blocks per activation in the C
// dimension.
int32_t cPerBlock, cPerGroup;
// The precomputed stride between instances.
int32_t hwc;
// The reciprocal of the number of elements per group, i.e. 1.f / (hw *
// cPerGroup), used to compute mean/var.
float invHWC;
// The precomputed number of groups per block.
int32_t groupsPerBlock;
// Epsilon, a constant added for numerical stability.
float eps;
};
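// How the precomputed fields relate to the shape fields (they are filled on
// the plugin's enqueue path below): hw = h * w, hwc = hw * c,
// cPerGroup = c / groups, invHWC = 1.f / (hw * cPerGroup), and
// groupsPerBlock = cPerBlock / cPerGroup.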
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION &
AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
...@@ -15,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/group_norm_op_plugin.h"
#include "paddle/phi/kernels/group_norm_kernel.h" #include "paddle/phi/kernels/group_norm_kernel.h"
#include <cub/cub.cuh>
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/layout.h" #include "paddle/phi/common/layout.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
...@@ -25,6 +28,262 @@ namespace tensorrt { ...@@ -25,6 +28,262 @@ namespace tensorrt {
namespace plugin { namespace plugin {
using DataLayout = phi::DataLayout; using DataLayout = phi::DataLayout;
static inline int32_t divUp(int32_t m, int32_t n) { return (m + n - 1) / n; }
static inline __device__ __host__ float sigmoid(float x) {
return 1.F / (1.F + expf(-x));
}
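// Swish(x) = x * sigmoid(x); the scale kernel below applies it when
// params.withSwish is set.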
struct GroupSums {
// Is it the 1st element of the group?
int32_t flag;
// The sum.
float sum;
// The sum of squares.
float sumSq;
};
struct GroupSumsOp {
inline __device__ GroupSums operator()(GroupSums const &a,
GroupSums const &b) {
GroupSums dst;
dst.sum = b.flag ? b.sum : (a.sum + b.sum);
dst.sumSq = b.flag ? b.sumSq : (a.sumSq + b.sumSq);
dst.flag = a.flag + b.flag;
return dst;
}
};
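// A worked example of the segmented scan driven by GroupSumsOp: for
// per-thread inputs {flag, sum} = {1, a0}, {0, a1}, {1, b0}, {0, b1}, the
// inclusive scan produces a0, a0 + a1, b0, b0 + b1 (and likewise for sumSq).
// The flag set on the first channel pair of each group resets the
// accumulation, so every group is reduced independently within the block.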
static int32_t findMaxDivisor(int32_t n, int32_t maxAllowedDivisor) {
int32_t maxDivisor = -1;
for (int32_t i = 1; i <= std::sqrt(n); i++) {
if (n % i == 0) {
int32_t divisor1 = n / i;
int32_t divisor2 = i;
if (divisor1 > maxDivisor && divisor1 < maxAllowedDivisor) {
maxDivisor = divisor1;
}
if (divisor2 > maxDivisor && divisor2 < maxAllowedDivisor) {
maxDivisor = divisor2;
}
}
}
return maxDivisor;
}
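// For example, findMaxDivisor(4096, 1024) returns 512: 1024 divides 4096 but
// is rejected by the strict '<' comparison, so the largest admissible divisor
// is 512.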
template <int tTHREADS_PER_BLOCK>
__global__ void groupNormNHWCSumKernel(const GroupNormNHWCParams params) {
// The object in charge of doing the sums for the different blocks.
typedef cub::BlockScan<GroupSums, tTHREADS_PER_BLOCK> BlockScan;
// Allocate shared memory for BlockScan.
__shared__ typename BlockScan::TempStorage tempStorage;
// Allocate shared memory for the groups. We could reduce the amount of shared
// memory reserved.
__shared__ float2 smem[tTHREADS_PER_BLOCK];
// The instance in the batch.
int32_t ni = blockIdx.z;
// The channel loaded by that thread (2 channels per thread for F16x2).
int32_t ci = blockIdx.x * params.cPerBlock + threadIdx.x * 2;
// The first activation loaded by that block.
int32_t hwBegin = blockIdx.y * params.hwPerBlock;
// The last activation loaded by that block.
int32_t hwEnd = min(hwBegin + params.hwPerBlock, params.hw);
// The sums.
float sum = 0.F;
float sumSq = 0.F;
// Iterate over the activations to compute the sums.
for (int32_t hwi = hwBegin; hwi < hwEnd; ++hwi) {
// The offset.
int64_t offset = static_cast<int64_t>(ni) * params.hwc +
static_cast<int64_t>(hwi) * params.c + ci;
// Fetch two channels per thread.
__half2 h2(0, 0);
if (ci < params.c) {
h2 = *reinterpret_cast<__half2 const *>(&params.srcX[offset]);
}
// Extract the two half values.
float2 f2 = __half22float2(h2);
// Update the sum.
sum += f2.x + f2.y;
// Update the sum of squares.
sumSq += f2.x * f2.x + f2.y * f2.y;
}
// The group that thread works on and the channel in the group (modulus).
int32_t gi = threadIdx.x * 2 / params.cPerGroup;
int32_t cj = threadIdx.x * 2 - params.cPerGroup * gi;
// The data for the summations.
GroupSums inp{cj == 0 ? 1 : 0, sum, sumSq};
// Do the segmented scan.
GroupSums out;
BlockScan(tempStorage).InclusiveScan(inp, out, GroupSumsOp());
// Store the results for the groups in shared memory (to produce coalesced
// stores later).
// 2 channels per thread
if (cj == params.cPerGroup - 2) {
smem[gi] = make_float2(out.sum, out.sumSq);
}
// Make sure the data is in shared memory.
__syncthreads();
// The global group index.
int32_t gj = blockIdx.x * params.groupsPerBlock + threadIdx.x;
// Threads that have nothing left to do, exit.
if (threadIdx.x >= params.groupsPerBlock || gj >= params.groups) {
return;
}
// The first threads (those storing to global memory) load the values.
float2 sums = smem[threadIdx.x];
// Store to global memory.
atomicAdd(&params.redBuffer[(2 * ni + 0) * params.groups + gj], sums.x);
atomicAdd(&params.redBuffer[(2 * ni + 1) * params.groups + gj], sums.y);
}
void groupNormNHWCSum(const GroupNormNHWCParams &params, cudaStream_t stream) {
dim3 grid;
// The number of blocks to compute all the channels.
grid.x = params.c / params.cPerBlock;
// The number of blocks to compute all the activations in a given instance.
grid.y = divUp(params.hw, params.hwPerBlock);
// The number of instances.
grid.z = params.n;
switch (params.cPerBlock) {
case 320:
groupNormNHWCSumKernel<160><<<grid, 160, 0, stream>>>(params);
break;
case 480:
groupNormNHWCSumKernel<256><<<grid, 256, 0, stream>>>(params);
break;
case 256:
groupNormNHWCSumKernel<128><<<grid, 128, 0, stream>>>(params);
break;
case 128:
groupNormNHWCSumKernel<64><<<grid, 64, 0, stream>>>(params);
break;
}
}
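// Note: grid.x uses integer division, so this launch assumes params.c is a
// multiple of params.cPerBlock, and only cPerBlock values of 128, 256, 320,
// or 480 are handled; any other value falls through the switch without
// launching a kernel (groupNormNHWCScale below throws instead).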
template <int tTHREADS_PER_BLOCK>
__global__ void groupNormNHWCScaleKernel(const GroupNormNHWCParams params) {
// The instance in the batch.
int32_t ni = blockIdx.z;
// The channel loaded by that thread (2 channels per thread for F16x2).
int32_t ci = blockIdx.x * params.cPerBlock + threadIdx.x * 2;
// The group that thread works on and the channel in the group (modulus).
int32_t gi = ci / params.cPerGroup;
// Load the sum and sum of squares for the group.
float sum = 0.F, sumSq = 0.F;
if (gi < params.groups) {
sum = params.redBuffer[(2 * ni + 0) * params.groups + gi];
sumSq = params.redBuffer[(2 * ni + 1) * params.groups + gi];
}
// Load gamma/beta.
float2 gammaF2, betaF2;
if (ci < params.c) {
gammaF2 = __half22float2(*reinterpret_cast<half2 const *>(
reinterpret_cast<half const *>(params.gamma) + ci));
betaF2 = __half22float2(*reinterpret_cast<half2 const *>(
reinterpret_cast<half const *>(params.beta) + ci));
}
// Compute the mean.
float mean = sum * params.invHWC;
// Compute the variance.
float var = sumSq * params.invHWC - (mean * mean);
// Compute the inverse of the stddev.
float invStdDev = rsqrtf(var + params.eps);
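// Note: this uses the identity Var[x] = E[x^2] - (E[x])^2, with the
// expectation taken over the hw * cPerGroup values of the group
// (params.invHWC is the reciprocal of that count).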
// The first activation loaded by that block.
int32_t hwBegin = blockIdx.y * params.hwPerBlock;
// The last activation loaded by that block.
int32_t hwEnd = min(hwBegin + params.hwPerBlock, params.hw);
// Iterate over the activations to compute the sums.
for (int32_t hwi = hwBegin; hwi < hwEnd; ++hwi) {
// The src/dst offset.
int64_t offset = (int64_t)ni * params.hwc + hwi * params.c + ci;
// Fetch two channels per thread.
__half2 h2(0, 0);
if (ci < params.c) {
h2 = *reinterpret_cast<__half2 const *>(&params.srcX[offset]);
}
// Extract the two half values.
float2 f2 = __half22float2(h2);
// Normalize the channels.
f2.x = (f2.x - mean) * invStdDev;
f2.y = (f2.y - mean) * invStdDev;
// Scale by gamma and add beta.
f2.x = gammaF2.x * f2.x + betaF2.x;
f2.y = gammaF2.y * f2.y + betaF2.y;
// Apply Swish if needed.
if (params.withSwish) {
f2.x = f2.x * sigmoid(f2.x);
f2.y = f2.y * sigmoid(f2.y);
}
// Store the scaled values.
if (ci < params.c) {
*reinterpret_cast<__half2 *>(&params.dst[offset]) = __float22half2_rn(f2);
}
}
}
void groupNormNHWCScale(const GroupNormNHWCParams &params,
cudaStream_t stream) {
dim3 grid;
// The number of blocks to compute all the channels.
grid.x = params.c / params.cPerBlock;
// The number of blocks to compute all the activations in a given instance.
grid.y = divUp(params.hw, params.hwPerBlock);
// The number of instances.
grid.z = params.n;
switch (params.cPerBlock) {
case 320:
groupNormNHWCScaleKernel<160><<<grid, 160, 0, stream>>>(params);
break;
case 480:
groupNormNHWCScaleKernel<256><<<grid, 256, 0, stream>>>(params);
break;
case 256:
groupNormNHWCScaleKernel<128><<<grid, 128, 0, stream>>>(params);
break;
case 128:
groupNormNHWCScaleKernel<64><<<grid, 64, 0, stream>>>(params);
break;
default:
PADDLE_THROW(platform::errors::Fatal(
"The function groupNormNHWCScale of the GroupNorm TRT plugin "
"received an unsupported cPerBlock value"));
}
}
int GroupNormPlugin::initialize() TRT_NOEXCEPT {
if (!with_fp16_) {
// if use fp32
...@@ -188,7 +447,8 @@ bool GroupNormPluginDynamic::supportsFormatCombination(
if (pos == 0) {
if (with_fp16_) {
return ((in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::PluginFormat::kLINEAR ||
in.format == nvinfer1::PluginFormat::kHWC8));
} else {
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
...@@ -275,9 +535,7 @@ int GroupNormPluginDynamic::enqueue(
int C = input_shape[1];
int image_size = input_shape[2] * input_shape[3];
int batchSize = input_shape[0];
std::vector<int64_t> batched_mean_shape = {batchSize * mean_shape_[0]};
std::vector<int64_t> batched_variance_shape = {batchSize *
variance_shape_[0]};
PADDLE_ENFORCE_EQ(
C,
scale_.size(),
...@@ -320,7 +578,7 @@ int GroupNormPluginDynamic::enqueue(
VLOG(1) << "TRT Plugin DataType selected. GroupNorm-->fp16";
const half *input = reinterpret_cast<const half *>(inputs[0]);
half *output = static_cast<half *>(outputs[0]);
if (input_desc[0].format == nvinfer1::PluginFormat::kLINEAR) {
phi::GroupNormDirectCUDAFunctor<half, float> group_norm;
group_norm(stream,
input,
...@@ -334,11 +592,62 @@ int GroupNormPluginDynamic::enqueue(
mean_d,
variance_d,
DataLayout::kNCHW);
} else if (input_desc[0].format == nvinfer1::PluginFormat::kHWC8) {
int32_t cPerBlock = 320;
int32_t maxBlocksPerHW = 1024;
switch (input_desc[0].dims.d[1]) {
case 960:
case 1920:
cPerBlock = 480;
break;
case 512:
case 256:
cPerBlock = 256;
break;
case 128:
cPerBlock = 128;
break;
default:
cPerBlock = 320;
}
params_.withSwish = false;
params_.dst = static_cast<half *>(outputs[0]);
params_.srcX = static_cast<half const *>(inputs[0]);
params_.gamma = scale_gpu_;
params_.beta = bias_gpu_;
params_.redBuffer = static_cast<float *>(workspace);
params_.n = input_desc[0].dims.d[0];
params_.h = input_desc[0].dims.d[2];
params_.w = input_desc[0].dims.d[3];
params_.c = input_desc[0].dims.d[1];
params_.groups = groups_;
params_.hw = params_.h * params_.w;
const int32_t blocksPerHW = findMaxDivisor(params_.hw, maxBlocksPerHW);
params_.hwPerBlock = divUp(params_.hw, blocksPerHW);
params_.cPerBlock = cPerBlock;
params_.cPerGroup = params_.c / params_.groups;
params_.hwc = params_.hw * params_.c;
params_.invHWC = 1.F / static_cast<float>(params_.hw * params_.cPerGroup);
params_.groupsPerBlock = cPerBlock / params_.cPerGroup;
params_.eps = eps_;
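// Note: redBuffer points into the TensorRT workspace; this path assumes the
// plugin's getWorkspaceSize() reserves at least
// 2 * sizeof(float) * params_.n * groups_ bytes for the reduction buffer
// cleared and accumulated below.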
cudaMemsetAsync(params_.redBuffer,
0,
2 * sizeof(float) * params_.n * groups_,
stream);
groupNormNHWCSum(params_, stream);
groupNormNHWCScale(params_, stream);
} else {
PADDLE_THROW(platform::errors::Fatal(
"The GroupNorm TRT plugin only supports NCHW (kLINEAR) or NHWC8 "
"(kHWC8) input"));
}
} else {
// input is neither fp32 nor fp16
PADDLE_THROW(platform::errors::Fatal(
"The GroupNorm TRT plugin only supports fp32 or fp16 input"));
}
return cudaGetLastError() != cudaSuccess;
}
......
...@@ -21,7 +21,9 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/common/groupNormPluginCommon.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
...@@ -274,6 +276,7 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT { ...@@ -274,6 +276,7 @@ class GroupNormPluginDynamic : public DynamicPluginTensorRT {
float eps_; float eps_;
std::vector<int64_t> mean_shape_; std::vector<int64_t> mean_shape_;
std::vector<int64_t> variance_shape_; std::vector<int64_t> variance_shape_;
GroupNormNHWCParams params_;
bool with_fp16_;
};
class GroupNormPluginDynamicCreator : public TensorRTPluginCreator {
......