optimize performance of interpolate op (#22436)

* optimize interpolate op, test=develop

optimize performance of interpolate op (#22436)
* optimize interpolate op, test=develop
2b1386b2 · LielinJiang · GitHub · 77dd0d97 · 2b1386b2 · 2b1386b2
隐藏空白更改
内联并排

Showing with 74 addition and 20 deletion

paddle/fluid/operators/interpolate_op.cu paddle/fluid/operators/interpolate_op.cu +26 -20

paddle/fluid/platform/gpu_launch_config.h paddle/fluid/platform/gpu_launch_config.h +48 -0

未找到文件。
--- a/paddle/fluid/operators/interpolate_op.cu
+++ b/paddle/fluid/operators/interpolate_op.cu
@@ -9,9 +9,11 @@
   See the License for the specific language governing permissions and
   limitations under the License. */
+#include <algorithm>
 #include <string>
 #include "paddle/fluid/operators/interpolate_op.h"
 #include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
 namespace paddle {
 namespace operators {
@@ -586,17 +588,18 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx,
  int out_chw = c * out_hw;
  int pixelNum = n * out_chw;
-  int grid_dim = (pixelNum + 512 - 1) / 512;
-  grid_dim = grid_dim > 8 ? 8 : grid_dim;
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixelNum, ctx);
  if ("nearest" == interp_method) {
-    KeNearestNeighborInterpFw<
+    KeNearestNeighborInterpFw<T><<<config.blocks, config.threads, 0,
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+                                   ctx.cuda_device_context().stream()>>>(
        input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
        out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
  } else if ("bilinear" == interp_method) {
-    KeBilinearInterpFw<
+    KeBilinearInterpFw<T><<<config.blocks, config.threads, 0,
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+                            ctx.cuda_device_context().stream()>>>(
        input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n,
        out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout);
  }
@@ -696,12 +699,13 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx,
  int out_cdhw = c * out_dhw;
  int pixelNum = n * out_cdhw;
-  int grid_dim = (pixelNum + 512 - 1) / 512;
-  grid_dim = grid_dim > 8 ? 8 : grid_dim;
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixelNum, ctx);
  if ("trilinear" == interp_method) {
-    KeTrilinearInterpFw<
+    KeTrilinearInterpFw<T><<<config.blocks, config.threads, 0,
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+                             ctx.cuda_device_context().stream()>>>(
        input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h,
        out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,
        align_mode, data_layout);
@@ -787,17 +791,18 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx,
  int out_chw = c * out_hw;
  int pixelNum = n * out_chw;
-  int grid_dim = (pixelNum + 512 - 1) / 512;
-  grid_dim = grid_dim > 8 ? 8 : grid_dim;
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixelNum, ctx);
  if ("nearest" == interp_method) {
-    KeNearestNeighborInterpBw<
+    KeNearestNeighborInterpBw<T><<<config.blocks, config.threads, 0,
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+                                   ctx.cuda_device_context().stream()>>>(
        input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
        n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout);
  } else if ("bilinear" == interp_method) {
-    KeBilinearInterpBw<
+    KeBilinearInterpBw<T><<<config.blocks, config.threads, 0,
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+                            ctx.cuda_device_context().stream()>>>(
        input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w,
        n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode,
        data_layout);
@@ -892,12 +897,13 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx,
  int out_cdhw = c * out_dhw;
  int pixelNum = n * out_cdhw;
-  int grid_dim = (pixelNum + 512 - 1) / 512;
-  grid_dim = grid_dim > 8 ? 8 : grid_dim;
+  platform::GpuLaunchConfig config =
+      platform::getGpuLaunchConfig(pixelNum, ctx);
  if ("trilinear" == interp_method) {
-    KeTrilinearInterpBw<
+    KeTrilinearInterpBw<T><<<config.blocks, config.threads, 0,
-        T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
+                             ctx.cuda_device_context().stream()>>>(
        input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d,
        out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners,
        align_mode, data_layout);

--- a/paddle/fluid/platform/gpu_launch_config.h
+++ b/paddle/fluid/platform/gpu_launch_config.h
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/platform/cuda_primitives.h"
+namespace paddle {
+namespace platform {
+struct GpuLaunchConfig {
+  // Number of threads per block.
+  int threads;
+  // Number of blocks for GPU kernel launch.
+  int blocks;
+  GpuLaunchConfig(int threads, int blocks) : threads(threads), blocks(blocks) {}
+};
+inline GpuLaunchConfig getGpuLaunchConfig(
+    const int N, const framework::ExecutionContext& ctx) {
+  int threads =
+      std::min(1024, ctx.cuda_device_context().GetMaxThreadsPerBlock());
+  int physical_thread_count =
+      std::min(ctx.cuda_device_context().GetMaxPhysicalThreadCount(), N);
+  int blocks = std::min((physical_thread_count + threads - 1) / threads,
+                        ctx.cuda_device_context().GetSMCount());
+  GpuLaunchConfig config(threads, blocks);
+  return config;
+}
+}  // namespace platform
+}  // namespace paddle