From 0d0ea9b79f61675a196c21e2f99fbb826c481fbe Mon Sep 17 00:00:00 2001
From: LielinJiang <50691816+LielinJiang@users.noreply.github.com>
Date: Fri, 7 Feb 2020 20:50:41 +0800
Subject: [PATCH] optimize performance of interpolate op (#22436) (#22489)

* optimize interpolate op.
test=develop
---
 paddle/fluid/operators/interpolate_op.cu  | 46 ++++++++++++----------
 paddle/fluid/platform/gpu_launch_config.h | 48 +++++++++++++++++++++++
 2 files changed, 74 insertions(+), 20 deletions(-)
 create mode 100644 paddle/fluid/platform/gpu_launch_config.h
diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu
index 6121389c128..d12a25f360e 100644
--- a/paddle/fluid/operators/interpolate_op.cu
+++ b/paddle/fluid/operators/interpolate_op.cu
@@ -9,9 +9,11 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
+#include <algorithm>
 #include <string>
 #include "paddle/fluid/operators/interpolate_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
 
 namespace paddle {
 namespace operators {
@@ -586,17 +588,18 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx,
   int out_chw = c * out_hw;
 
   int pixelNum = n * out_chw;
-  int grid_dim = (pixelNum + 512 - 1) / 512;
-  grid_dim = grid_dim > 8 ? 8 : grid_dim;
+
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixelNum, ctx);
 
   if ("nearest" == interp_method) {
-    KeNearestNeighborInterpFw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+    KeNearestNeighborInterpFw<T><<<config.blocks, config.threads, 0,
+                                   ctx.cuda_device_context().stream()>>>(
         input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
         out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
   } else if ("bilinear" == interp_method) {
-    KeBilinearInterpFw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+    KeBilinearInterpFw<T><<<config.blocks, config.threads, 0,
+                            ctx.cuda_device_context().stream()>>>(
         input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
         out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout);
   }
@@ -696,12 +699,13 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx,
   int out_cdhw = c * out_dhw;
 
   int pixelNum = n * out_cdhw;
-  int grid_dim = (pixelNum + 512 - 1) / 512;
-  grid_dim = grid_dim > 8 ? 8 : grid_dim;
+
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixelNum, ctx);
 
   if ("trilinear" == interp_method) {
-    KeTrilinearInterpFw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+    KeTrilinearInterpFw<T><<<config.blocks, config.threads, 0,
+                             ctx.cuda_device_context().stream()>>>(
         input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h,
         out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,
         align_mode, data_layout);
@@ -787,17 +791,18 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx,
   int out_chw = c * out_hw;
 
   int pixelNum = n * out_chw;
-  int grid_dim = (pixelNum + 512 - 1) / 512;
-  grid_dim = grid_dim > 8 ? 8 : grid_dim;
+
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixelNum, ctx);
 
   if ("nearest" == interp_method) {
-    KeNearestNeighborInterpBw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+    KeNearestNeighborInterpBw<T><<<config.blocks, config.threads, 0,
+                                   ctx.cuda_device_context().stream()>>>(
         input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
         n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
   } else if ("bilinear" == interp_method) {
-    KeBilinearInterpBw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+    KeBilinearInterpBw<T><<<config.blocks, config.threads, 0,
+                            ctx.cuda_device_context().stream()>>>(
         input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
         n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode,
         data_layout);
@@ -892,12 +897,13 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx,
   int out_cdhw = c * out_dhw;
 
   int pixelNum = n * out_cdhw;
-  int grid_dim = (pixelNum + 512 - 1) / 512;
-  grid_dim = grid_dim > 8 ? 8 : grid_dim;
+
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixelNum, ctx);
 
   if ("trilinear" == interp_method) {
-    KeTrilinearInterpBw<
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+    KeTrilinearInterpBw<T><<<config.blocks, config.threads, 0,
+                             ctx.cuda_device_context().stream()>>>(
         input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d,
         out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,
         align_mode, data_layout);
diff --git a/paddle/fluid/platform/gpu_launch_config.h b/paddle/fluid/platform/gpu_launch_config.h
new file mode 100644
index 00000000000..d57478b8978
--- /dev/null
+++ b/paddle/fluid/platform/gpu_launch_config.h
@@ -0,0 +1,48 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace platform {
+
+struct GpuLaunchConfig {
+  // Number of threads per block.
+  int threads;
+  // Number of blocks for GPU kernel launch.
+  int blocks;
+
+  GpuLaunchConfig(int threads, int blocks) : threads(threads), blocks(blocks) {}
+};
+
+inline GpuLaunchConfig getGpuLaunchConfig(
+    const int N, const framework::ExecutionContext& ctx) {
+  int threads =
+      std::min(1024, ctx.cuda_device_context().GetMaxThreadsPerBlock());
+  int physical_thread_count =
+      std::min(ctx.cuda_device_context().GetMaxPhysicalThreadCount(), N);
+  int blocks = std::min((physical_thread_count + threads - 1) / threads,
+                        ctx.cuda_device_context().GetSMCount());
+
+  GpuLaunchConfig config(threads, blocks);
+
+  return config;
+}
+
+}  // namespace platform
+}  // namespace paddle
-- 
GitLab