Fix CE ocr_recognition test failure (#20987)

The ocr_recognition test fails, so add a separate code path (tiled_size = 8) to handle small frame_size (frame_size < 16). test=develop

Fix CE ocr_recognition test failure (#20987)
The ocr_recognition test fails, so add a separate code path (tiled_size = 8) to handle small frame_size (frame_size < 16). test=develop
0059404e · zhaoyuchen2018 · GitHub · f56967c4 · 0059404e · 0059404e
Showing 2 changed files with 39 additions and 30 deletions

paddle/fluid/operators/math/detail/gru_gpu_kernel.h paddle/fluid/operators/math/detail/gru_gpu_kernel.h +4 -7

paddle/fluid/operators/math/gru_compute.cu paddle/fluid/operators/math/gru_compute.cu +35 -23

未找到文件。
--- a/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_gpu_kernel.h
@@ -105,7 +105,7 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
 * threads(tile_size, 1)
 * grid(frame_blocks, 1)
 */
-template <class T>
+template <class T, int Tiled_size>
 __global__ void KeFastCollectiveGruGate(T *gate_value, T *prev_output_value,
                                        T *gate_weight, T *reset_output,
                                        int frame_size,
@@ -113,9 +113,7 @@ __global__ void KeFastCollectiveGruGate(T *gate_value, T *prev_output_value,
  T xt_0 = 0.0f;
  T a0 = 0.0f;
  T c0 = 0.0f;
+  T b0[Tiled_size];
-  int Tiled_size = blockDim.x;
-  T b0[16];
  int COL = blockIdx.x * blockDim.x + threadIdx.x;
  int Tiled_mask = ((1 << Tiled_size) - 1);
@@ -165,7 +163,7 @@ __global__ void KeFastCollectiveGruGate(T *gate_value, T *prev_output_value,
 * threads(tile_size, 1)
 * grid(frame_blocks, 1)
 */
-template <class T>
+template <class T, int Tiled_size>
 __global__ void KeFastCollectiveGruOut(T *gate_weight, T *prev_out_value,
                                       T *output_value, T *gate_value,
                                       T *reset_value, int frame_size,
@@ -174,10 +172,9 @@ __global__ void KeFastCollectiveGruOut(T *gate_weight, T *prev_out_value,
  int COL = blockIdx.x * blockDim.x + threadIdx.x;
  T a0 = 0.0f;
-  T b0[16];
+  T b0[Tiled_size];
  T c0 = 0.0f;
-  int Tiled_size = blockDim.x;
  int Tiled_mask = ((1 << Tiled_size) - 1);
  //- Tiled  matrix multiply with register shift
  if (prev_out_value) {

--- a/paddle/fluid/operators/math/gru_compute.cu
+++ b/paddle/fluid/operators/math/gru_compute.cu
@@ -31,29 +31,41 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
    dim3 grid;
    if (batch_size == 1) {
      if (context.GetComputeCapability() >= 70) {
-        auto ComputeTiledSize = [](int frame_size) {
+        if (frame_size < 16) {
-          if (frame_size >= 16)
+          constexpr int tiled_size = 8;
-            return 16;
+          int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size;
-          else if (frame_size < 16)
+          threads = dim3(tiled_size, 1);
-            return 8;
+          grid = dim3(frame_blocks, 1);
-        };
+          detail::KeFastCollectiveGruGate<
+              T, tiled_size><<<grid, threads, 0, stream>>>(
-        auto tiled_size = ComputeTiledSize(frame_size);
+              value.gate_value, value.prev_out_value, value.gate_weight,
-        int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size;
+              value.reset_output_value, frame_size, active_gate);
-        threads = dim3(tiled_size, 1);
-        grid = dim3(frame_blocks, 1);
+          frame_blocks = (frame_size + tiled_size - 1) / tiled_size;
+          grid = dim3(frame_blocks, 1);
-        detail::KeFastCollectiveGruGate<T><<<grid, threads, 0, stream>>>(
+          detail::KeFastCollectiveGruOut<
-            value.gate_value, value.prev_out_value, value.gate_weight,
+              T, tiled_size><<<grid, threads, 0, stream>>>(
-            value.reset_output_value, frame_size, active_gate);
+              value.state_weight, value.prev_out_value, value.output_value,
+              value.gate_value, value.reset_output_value, frame_size,
-        frame_blocks = (frame_size + tiled_size - 1) / tiled_size;
+              active_node, origin_mode);
-        grid = dim3(frame_blocks, 1);
+        } else {
-        detail::KeFastCollectiveGruOut<T><<<grid, threads, 0, stream>>>(
+          constexpr int tiled_size = 16;
-            value.state_weight, value.prev_out_value, value.output_value,
+          int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size;
-            value.gate_value, value.reset_output_value, frame_size, active_node,
+          threads = dim3(tiled_size, 1);
-            origin_mode);
+          grid = dim3(frame_blocks, 1);
+          detail::KeFastCollectiveGruGate<
+              T, tiled_size><<<grid, threads, 0, stream>>>(
+              value.gate_value, value.prev_out_value, value.gate_weight,
+              value.reset_output_value, frame_size, active_gate);
+          frame_blocks = (frame_size + tiled_size - 1) / tiled_size;
+          grid = dim3(frame_blocks, 1);
+          detail::KeFastCollectiveGruOut<
+              T, tiled_size><<<grid, threads, 0, stream>>>(
+              value.state_weight, value.prev_out_value, value.output_value,
+              value.gate_value, value.reset_output_value, frame_size,
+              active_node, origin_mode);
+        }
        return;
      } else {
        int frame_per_block = frame_size <= 1024 ? frame_size : 1024;