From 57e5f61ec8b6822bd897df15478c646cf347097b Mon Sep 17 00:00:00 2001 From: jerrywgz Date: Wed, 23 Jan 2019 05:50:09 +0000 Subject: [PATCH] add gpu kernel, test=develop --- .../fluid/operators/detection/box_clip_op.cu | 74 +++++++++++++++++++ python/paddle/fluid/tests/test_detection.py | 3 +- 2 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/detection/box_clip_op.cu diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu new file mode 100644 index 00000000000..f10c92366de --- /dev/null +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/box_clip_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTenso = framework::LoDTensor; + +static constexpr int ImInfoSize = 3; + +template +static __global__ void GPUBoxClip(const T *input, const size_t *lod, + const size_t width, const T *im_info, + T *output) { + for (int i = threadIdx.x; i < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * width; + i += BlockSize) { + int idx = lod[blockIdx.x] * width + i; + T im_w = round(im_info[blockIdx.x * ImInfoSize + 1] / + im_info[blockIdx.x * ImInfoSize + 2]); + T im_h = round(im_info[blockIdx.x * ImInfoSize] / + im_info[blockIdx.x * ImInfoSize + 2]); + T im_size = (idx % 2 == 0) ? im_w : im_h; + output[idx] = max(min(input[idx], im_size - 1), T(0.)); + } +} + +template +class GPUBoxClipKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + auto *input = context.Input("Input"); + auto *im_info = context.Input("ImInfo"); + auto *output = context.Output("Output"); + const int64_t num = input->dims()[0]; + const int64_t bbox_width = input->numel() / num; + auto lod = input->lod(); + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + auto &dev_ctx = context.template device_context(); + auto stream = dev_ctx.stream(); + const size_t num_lod = lod.back().size() - 1; + T *output_data = output->mutable_data(dev_ctx.GetPlace()); + GPUBoxClip<<>>( + input->data(), abs_offset_lod[0].CUDAMutableData(dev_ctx.GetPlace()), + bbox_width, im_info->data(), output_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + box_clip, ops::GPUBoxClipKernel, + ops::GPUBoxClipKernel); diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index bbc372da1a8..4d8f2b1db1f 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -354,7 +354,8 @@ class TestGenerateProposals(unittest.TestCase): data_shape = [20, 64, 64] images = fluid.layers.data( name='images', shape=data_shape, dtype='float32') - im_info = fluid.layers.data(name='im_info', shape=[3], dtype='float32') + im_info = fluid.layers.data( + name='im_info', shape=[1, 3], dtype='float32') anchors, variances = fluid.layers.anchor_generator( name='anchor_generator', input=images, -- GitLab