softmax reconstruction and optimization (#31821)

63abd500 · xingfeng01 · GitHub · 8552a182 · 63abd500 · 63abd500
3 changed files
--- a/paddle/fluid/operators/softmax_cudnn_op.cu
+++ b/paddle/fluid/operators/softmax_cudnn_op.cu
--- a/paddle/fluid/operators/softmax_impl.cuh
+++ b/paddle/fluid/operators/softmax_impl.cuh
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/fluid/platform/cuda_device_function.h"
+namespace paddle {
+namespace operators {
+// Sums BatchSize independent values across lanes, in place.
+//
+// Template parameters:
+//   T         - element type; must support operator+.
+//   BatchSize - number of entries in `sum` reduced per call.
+//   WarpSize  - width of the reduction; the exchange distance starts at
+//               WarpSize / 2 and is halved until it reaches zero.
+//
+// sum: per-thread array with at least BatchSize entries; each entry is
+//      overwritten with the accumulated value.
+//
+// NOTE(review): platform::CudaShuffleXorSync presumably wraps
+// __shfl_xor_sync, in which case every lane of a WarpSize-wide group ends up
+// holding the group total - confirm against the helper. The hard-coded
+// 0xFFFFFFFF mask names all 32 lanes as participants, so callers are expected
+// to reach this with the whole warp.
+template <typename T, int BatchSize, int WarpSize>
+__device__ __forceinline__ void WarpReduceSum(T* sum) {
+#pragma unroll
+  for (int stride = WarpSize >> 1; stride >= 1; stride >>= 1) {
+#pragma unroll
+    for (int idx = 0; idx < BatchSize; ++idx) {
+      // Fetch the partner lane's partial value, then fold it into ours.
+      const T peer =
+          platform::CudaShuffleXorSync(0xFFFFFFFF, sum[idx], stride);
+      sum[idx] = sum[idx] + peer;
+    }
+  }
+}
+// Takes the maximum of BatchSize independent values across lanes, in place.
+//
+// Template parameters:
+//   T         - element type; must be accepted by max().
+//   BatchSize - number of entries in `sum` reduced per call.
+//   WarpSize  - width of the reduction; the exchange distance starts at
+//               WarpSize / 2 and is halved until it reaches zero.
+//
+// sum: per-thread array with at least BatchSize entries. Despite its name it
+//      holds maximum candidates here; each entry is overwritten with the
+//      larger of itself and the value received from the partner lane.
+//
+// NOTE(review): platform::CudaShuffleXorSync presumably wraps
+// __shfl_xor_sync, in which case every lane of a WarpSize-wide group ends up
+// holding the group maximum - confirm against the helper. The hard-coded
+// 0xFFFFFFFF mask names all 32 lanes as participants, so callers are expected
+// to reach this with the whole warp.
+template <typename T, int BatchSize, int WarpSize>
+__device__ __forceinline__ void WarpReduceMax(T* sum) {
+#pragma unroll
+  for (int stride = WarpSize >> 1; stride >= 1; stride >>= 1) {
+#pragma unroll
+    for (int idx = 0; idx < BatchSize; ++idx) {
+      // Fetch the partner lane's candidate and keep the larger of the two.
+      const T peer =
+          platform::CudaShuffleXorSync(0xFFFFFFFF, sum[idx], stride);
+      sum[idx] = max(sum[idx], peer);
+    }
+  }
+}
+}  // namespace operators
+}  // namespace paddle
\ No newline at end of file
--- a/paddle/fluid/operators/softmax_op.h
+++ b/paddle/fluid/operators/softmax_op.h
@@ -45,6 +45,14 @@ static inline int SizeFromAxis(const int axis, DDim dims) {
  return size;
 }
+// Returns the product of dims[i] for every i strictly greater than `axis`,
+// or 1 when no such index exists.
+//
+// NOTE(review): assumes `axis` has already been normalised to a valid,
+// non-negative index - TODO confirm at the call sites. The running product is
+// stored in an int, so very large trailing extents would be narrowed.
+static inline int SizeOutAxis(const int axis, DDim dims) {
+  int product = 1;
+  int d = axis + 1;
+  while (d < dims.size()) {
+    product *= dims[d];
+    ++d;
+  }
+  return product;
+}
 template <typename DeviceContext, typename T>
 class SoftmaxKernel : public framework::OpKernel<T> {
 public: