From 0d0ea9b79f61675a196c21e2f99fbb826c481fbe Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Fri, 7 Feb 2020 20:50:41 +0800 Subject: [PATCH] optimize performance of interpolate op (#22436) (#22489) * optimize interpolate op. test=develop --- paddle/fluid/operators/interpolate_op.cu | 46 ++++++++++++---------- paddle/fluid/platform/gpu_launch_config.h | 48 +++++++++++++++++++++++ 2 files changed, 74 insertions(+), 20 deletions(-) create mode 100644 paddle/fluid/platform/gpu_launch_config.h diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 6121389c128..d12a25f360e 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -9,9 +9,11 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include "paddle/fluid/operators/interpolate_op.h" #include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace operators { @@ -586,17 +588,18 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, int out_chw = c * out_hw; int pixelNum = n * out_chw; - int grid_dim = (pixelNum + 512 - 1) / 512; - grid_dim = grid_dim > 8 ? 
8 : grid_dim; + + platform::GpuLaunchConfig config = + platform::getGpuLaunchConfig(pixelNum, ctx); if ("nearest" == interp_method) { - KeNearestNeighborInterpFw< - T><<>>( + KeNearestNeighborInterpFw<<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } else if ("bilinear" == interp_method) { - KeBilinearInterpFw< - T><<>>( + KeBilinearInterpFw<<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); } @@ -696,12 +699,13 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, int out_cdhw = c * out_dhw; int pixelNum = n * out_cdhw; - int grid_dim = (pixelNum + 512 - 1) / 512; - grid_dim = grid_dim > 8 ? 8 : grid_dim; + + platform::GpuLaunchConfig config = + platform::getGpuLaunchConfig(pixelNum, ctx); if ("trilinear" == interp_method) { - KeTrilinearInterpFw< - T><<>>( + KeTrilinearInterpFw<<>>( input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, align_mode, data_layout); @@ -787,17 +791,18 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, int out_chw = c * out_hw; int pixelNum = n * out_chw; - int grid_dim = (pixelNum + 512 - 1) / 512; - grid_dim = grid_dim > 8 ? 
8 : grid_dim; + + platform::GpuLaunchConfig config = + platform::getGpuLaunchConfig(pixelNum, ctx); if ("nearest" == interp_method) { - KeNearestNeighborInterpBw< - T><<>>( + KeNearestNeighborInterpBw<<>>( input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } else if ("bilinear" == interp_method) { - KeBilinearInterpBw< - T><<>>( + KeBilinearInterpBw<<>>( input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); @@ -892,12 +897,13 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, int out_cdhw = c * out_dhw; int pixelNum = n * out_cdhw; - int grid_dim = (pixelNum + 512 - 1) / 512; - grid_dim = grid_dim > 8 ? 8 : grid_dim; + + platform::GpuLaunchConfig config = + platform::getGpuLaunchConfig(pixelNum, ctx); if ("trilinear" == interp_method) { - KeTrilinearInterpBw< - T><<>>( + KeTrilinearInterpBw<<>>( input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d, out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, align_mode, data_layout); diff --git a/paddle/fluid/platform/gpu_launch_config.h b/paddle/fluid/platform/gpu_launch_config.h new file mode 100644 index 00000000000..d57478b8978 --- /dev/null +++ b/paddle/fluid/platform/gpu_launch_config.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace platform {
+
+// Launch shape for a GPU kernel: threads per block and number of blocks.
+struct GpuLaunchConfig {
+  int threads;  // Number of threads per block.
+  int blocks;   // Number of blocks for GPU kernel launch.
+
+  GpuLaunchConfig(int threads, int blocks) : threads(threads), blocks(blocks) {}
+};
+
+// Picks a launch shape for N work items using ctx.cuda_device_context():
+// threads = min(1024, GetMaxThreadsPerBlock()); blocks covers
+// min(GetMaxPhysicalThreadCount(), N) items, capped at GetSMCount() and
+// floored at 1 because a kernel launch with a zero-sized grid is an error.
+// NOTE(review): relies on the includer declaring framework::ExecutionContext.
+inline GpuLaunchConfig getGpuLaunchConfig(
+    const int N, const framework::ExecutionContext& ctx) {
+  auto&& dev_ctx = ctx.cuda_device_context();
+  const int threads = std::min(1024, dev_ctx.GetMaxThreadsPerBlock());
+  const int work = std::min(dev_ctx.GetMaxPhysicalThreadCount(), N);
+  const int blocks =
+      std::min((work + threads - 1) / threads, dev_ctx.GetSMCount());
+  return GpuLaunchConfig(threads, std::max(1, blocks));
+}
+
+}  // namespace platform
+}  // namespace paddle
-- 
GitLab