From 8a5a45f8bc6bf4188e6e314646d46ddc477fc0fd Mon Sep 17 00:00:00 2001 From: whs Date: Tue, 1 Jun 2021 23:45:26 +0800 Subject: [PATCH] Fix cuda kernel launch of grid sampler (#33100) (#33232) --- paddle/fluid/operators/grid_sampler_op.cu | 26 ++++++------ .../unittests/test_bilinear_interp_op.py | 2 + .../tests/unittests/test_grid_sampler_op.py | 42 ++++++++++++++++++- 3 files changed, 56 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu index e9b0a0108a..762d14096a 100644 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ b/paddle/fluid/operators/grid_sampler_op.cu @@ -187,7 +187,6 @@ __global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c, int out_sC = out_h * out_w; int out_sH = out_w; int out_sW = 1; - CUDA_KERNEL_LOOP(index, nthreads) { const int w = index % out_w; const int h = (index / out_w) % out_h; @@ -199,7 +198,6 @@ __global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c, ix = compute_positions(ix, in_w, padding_mode, align_corners); iy = compute_positions(iy, in_h, padding_mode, align_corners); - if (mode == Mode::bilinear) { int ix_nw = static_cast(floor(ix)); int iy_nw = static_cast(floor(iy)); @@ -216,6 +214,7 @@ __global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c, T se = (ix - ix_nw) * (iy - iy_nw); auto inp_offset_NC = n * inp_sN; + auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; for (int c = 0; c < out_c; ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { @@ -291,17 +290,17 @@ class GridSampleOpCUDAKernel : public framework::OpKernel { << "; out_w: " << out_w; auto* output = ctx.Output("Output"); auto* output_data = output->mutable_data(ctx.GetPlace()); - - VLOG(3) << "set constant"; + VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1] + << "; " << output->dims()[2] << "; " << output->dims()[3]; math::SetConstant()( dev_ctx, output, static_cast(0)); int count = static_cast(n * out_h * out_w); - auto cu_stream = dev_ctx.stream(); - - int block = 512; - int grid_size = (count + block - 1) / block; - grid_sample_cuda_kernel<<>>( + int block_size = 512; + int grid_size = (count + block_size - 1) / block_size; + VLOG(3) << "cuda launch - grid dims: " << grid_size << "; block dims" + << block_size; + grid_sample_cuda_kernel<<>>( count, n, c, out_h, out_w, in_h, in_w, input->data(), grid->data(), output_data, mode, padding_mode, align_corners); } @@ -475,9 +474,12 @@ class GridSampleGradOpCUDAKernel : public framework::OpKernel { int count = static_cast(n * out_h * out_w); auto cu_stream = dev_ctx.stream(); - int block = 512; - int grid_size = (count + block - 1) / block; - grid_sampler_cuda_backward_kernel<<>>( + int block_size = 512; + int grid_size = (count + block_size - 1) / block_size; + VLOG(3) << "cuda launch grad kernel - grid dims: " << grid_size + << "; block dims" << block_size << "; count: " << count; + grid_sampler_cuda_backward_kernel< + T><<>>( count, output_grad->data(), input->data(), grid->data(), n, c, out_h, out_w, in_h, in_w, input_grad->data(), grid_grad_data, mode, padding_mode, align_corners); diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py index 287e85cb27..083b671c28 100755 --- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py @@ -19,6 +19,8 @@ import numpy as np from op_test import OpTest import paddle.fluid.core as core import paddle.fluid as fluid +import paddle +paddle.enable_static() def bilinear_interp_np(input, diff --git a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py index bf2f9518fb..1a62f11f59 100644 --- a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py +++ b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import paddle import unittest import numpy as np -from op_test import OpTest +import paddle.fluid.core as core +from op_test import OpTest, skip_check_grad_ci +paddle.enable_static() def AffineGrid(theta, grid_shape): @@ -159,7 +162,6 @@ class TestGridSamplerOp(OpTest): "padding_mode": self.padding_mode, "mode": self.mode } - # print("X: {}".format(x)) self.outputs = { 'Output': GridSampler(x, grid, self.align_corners, self.mode, self.padding_mode) @@ -236,5 +238,41 @@ class Case4(TestGridSamplerOp): self.numeric_grad_delta = 0.0001 +@skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " + + "however it is desirable to cover the forward pass") +class LargeInputCase(TestGridSamplerOp): + def get_places(self): + places = [] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + def initTestCase(self): + self.no_need_check_grad = True + self.x_shape = (2, 3, 128, 128) + self.grid_shape = (2, 130, 130, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = False + self.padding_mode = "reflection" + self.mode = "bilinear" + + def test_check_grad_normal(self): + pass + + +@skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " + + "however it is desirable to cover the forward pass") +class Case5(LargeInputCase): + def initTestCase(self): + self.no_need_check_grad = True + self.x_shape = (2, 3, 128, 128) + self.grid_shape = (2, 130, 130, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = True + self.padding_mode = "zeros" + self.mode = "bilinear" + self.use_cudnn = False if core.is_compiled_with_rocm() else True + + if __name__ == "__main__": unittest.main() -- GitLab