未验证 提交 626c1edc 编写于 作者: s.feng's avatar s.feng 提交者: GitHub

Fix yolo_box failing to run on Jetson Nano and TX2 (#33422)

上级 a6b33281
......@@ -120,7 +120,14 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel<T> {
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), n * box_num);
KeYoloBoxFw<T><<<config.block_per_grid, config.thread_per_block, 0,
dim3 thread_num = config.thread_per_block;
#ifdef WITH_NV_JETSON
if (config.compute_capability == 53 || config.compute_capability == 62) {
thread_num = 512;
}
#endif
KeYoloBoxFw<T><<<config.block_per_grid, thread_num, 0,
ctx.cuda_device_context().stream()>>>(
input_data, imgsize_data, boxes_data, scores_data, conf_thresh,
anchors_data, n, h, w, an_num, class_num, box_num, input_size_h,
......
......@@ -37,6 +37,7 @@ struct GpuLaunchConfig {
dim3 theory_thread_count = dim3(1, 1, 1);
dim3 thread_per_block = dim3(1, 1, 1);
dim3 block_per_grid = dim3(1, 1, 1);
int compute_capability = 0;
};
inline GpuLaunchConfig GetGpuLaunchConfig1D(
......@@ -67,11 +68,14 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(
std::min(max_threads, context.GetMaxThreadsPerBlock());
const int block_count =
std::min(DivUp(physical_thread_count, thread_per_block), sm);
// Get compute_capability
const int capability = context.GetComputeCapability();
GpuLaunchConfig config;
config.theory_thread_count.x = theory_thread_count;
config.thread_per_block.x = thread_per_block;
config.block_per_grid.x = block_count;
config.compute_capability = capability;
return config;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册