Fix the yolo_box bug that prevents it from running on Jetson Nano and TX2 (#33422) (#33442)

d4967224 · s.feng · GitHub · 6385f5ee · d4967224 · d4967224
Showing 2 changed files with 12 additions and 1 deletion

paddle/fluid/operators/detection/yolo_box_op.cu paddle/fluid/operators/detection/yolo_box_op.cu +8 -1

paddle/fluid/platform/gpu_launch_config.h paddle/fluid/platform/gpu_launch_config.h +4 -0

File not found.
--- a/paddle/fluid/operators/detection/yolo_box_op.cu
+++ b/paddle/fluid/operators/detection/yolo_box_op.cu
@@ -111,7 +111,14 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel<T> {
    platform::GpuLaunchConfig config =
        platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), n * box_num);
-    KeYoloBoxFw<T><<<config.block_per_grid, config.thread_per_block, 0,
+    dim3 thread_num = config.thread_per_block;
+#ifdef WITH_NV_JETSON
+    if (config.compute_capability == 53 || config.compute_capability == 62) {
+      thread_num = 512;
+    }
+#endif
+    KeYoloBoxFw<T><<<config.block_per_grid, thread_num, 0,
                     ctx.cuda_device_context().stream()>>>(
        input_data, imgsize_data, boxes_data, scores_data, conf_thresh,
        anchors_data, n, h, w, an_num, class_num, box_num, input_size_h,

--- a/paddle/fluid/platform/gpu_launch_config.h
+++ b/paddle/fluid/platform/gpu_launch_config.h
@@ -37,6 +37,7 @@ struct GpuLaunchConfig {
  dim3 theory_thread_count = dim3(1, 1, 1);
  dim3 thread_per_block = dim3(1, 1, 1);
  dim3 block_per_grid = dim3(1, 1, 1);
+  int compute_capability = 0;
 };
 inline GpuLaunchConfig GetGpuLaunchConfig1D(
@@ -67,11 +68,14 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(
      std::min(max_threads, context.GetMaxThreadsPerBlock());
  const int block_count =
      std::min(DivUp(physical_thread_count, thread_per_block), sm);
+  // Get compute_capability
+  const int capability = context.GetComputeCapability();
  GpuLaunchConfig config;
  config.theory_thread_count.x = theory_thread_count;
  config.thread_per_block.x = thread_per_block;
  config.block_per_grid.x = block_count;
+  config.compute_capability = capability;
  return config;
 }