!4958 Fix GPU-ArgMaxWithValue

Merge pull request !4958 from 34bunny/GPU-argmaxwithvalue-fix

!4958 Fix GPU-ArgMaxWithValue
Merge pull request !4958 from 34bunny/GPU-argmaxwithvalue-fix
660aa8e6 · mindspore-ci-bot · Gitee · 469b132c · 92082038 · 660aa8e6
隐藏空白更改
内联并排

Showing with 23 addition and 28 deletion

mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/argmaxwithvalue_impl.cu ...end/kernel_compiler/gpu/cuda_impl/argmaxwithvalue_impl.cu +23 -28

未找到文件。
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/argmaxwithvalue_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/argmaxwithvalue_impl.cu
@@ -18,39 +18,34 @@
 #include "runtime/device/gpu/cuda_common.h"
 #include "include/cuda_fp16.h"
 template <typename T, typename S>
-__global__ void ArgmaxWithValue(const T* input, const int bound, int outerSize, int innerSize, S* index,
-                                T* output) {
-  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (outerSize); pos += blockDim.x * gridDim.x) {
-      int inputOutterOffset = pos * innerSize * bound;
-      int outputOutterOffset = pos * innerSize;
-      for (int j = 0; j < innerSize; j++) {
-          auto outputInnerOffset = outputOutterOffset + j;
-          S idx = 0;
-          T maxData = input[j + inputOutterOffset];
-          for (S c = 0; c < bound; c++) {
-              int offset = j + c * innerSize;
-              auto inputData = input[inputOutterOffset + offset];
-              idx = inputData > maxData ? c : idx;
-              maxData = inputData > maxData ? inputData : maxData;
-          }
-          output[outputInnerOffset] = maxData;
-          index[outputInnerOffset] = idx;
-      }
+__global__ void ArgmaxWithValue(const T *input, const int bound, int outerSize, int innerSize, S *index, T *output) {
+  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < outerSize * innerSize; pos += gridDim.x * blockDim.x) {
+    int x = pos / innerSize % outerSize;
+    int y = pos % innerSize;
+    S idx = 0;
+    int InputOffset = x * bound * innerSize + 0 * innerSize + y;
+    T maxData = input[InputOffset];
+    for (int i = 0; i < bound; i++) {
+      InputOffset = x * bound * innerSize + i * innerSize + y;
+      auto inputData = input[InputOffset];
+      idx = inputData > maxData ? i : idx;
+      maxData = inputData > maxData ? inputData : maxData;
+    }
+    output[pos] = maxData;
+    index[pos] = idx;
  }
  return;
 }

 template <typename T, typename S>
-void CalArgmaxWithValue(const T* input, const int bound_, const int outerSize_, const int innerSize_,
-                        S* index, T* output, cudaStream_t cuda_stream) {
-  ArgmaxWithValue<<<GET_BLOCKS(outerSize_), GET_THREADS, 0, cuda_stream>>>(input, bound_, outerSize_, innerSize_,
-                                                                           index, output);
+void CalArgmaxWithValue(const T *input, const int bound_, const int outerSize_, const int innerSize_, S *index,
+                        T *output, cudaStream_t cuda_stream) {
+  ArgmaxWithValue<<<GET_BLOCKS(outerSize_), GET_THREADS, 0, cuda_stream>>>(input, bound_, outerSize_, innerSize_, index,
+                                                                           output);
  return;
 }

-template void CalArgmaxWithValue<float, int>(const float* input, const int bound_, const int outerSize_,
-                                  const int innerSize_, int* index, float* output,
-                                  cudaStream_t cuda_stream);
-template void CalArgmaxWithValue<half, int>(const half* input, const int bound_, const int outerSize_,
-                                  const int innerSize_, int* index, half* output,
-                                  cudaStream_t cuda_stream);
+template void CalArgmaxWithValue<float, int>(const float *input, const int bound_, const int outerSize_,
+                                             const int innerSize_, int *index, float *output, cudaStream_t cuda_stream);
+template void CalArgmaxWithValue<half, int>(const half *input, const int bound_, const int outerSize_,
+                                            const int innerSize_, int *index, half *output, cudaStream_t cuda_stream);