/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <paddle/fluid/memory/allocation/allocator.h>
#include "cub/cub.cuh"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/detection/bbox_util.h"
#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/for_range.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;

static constexpr int kNumCUDAThreads = 64;
static constexpr int kNumMaxinumNumBlocks = 4096;

int const BBoxSize = 4;

static inline int NumBlocks(const int N) {
  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaxinumNumBlocks);
}

// For each RoI, compute the FPN level it should be routed to (target_lvls)
// and accumulate the per-(level, image) RoI counts into sub_lod_list.
template <class T>
__global__ void GPUDistFpnProposalsHelper(
    const int nthreads, const T* rois, const int lod_size,
    const int refer_level, const int refer_scale, const int max_level,
    const int min_level, int* roi_batch_id_data, int* sub_lod_list,
    int* target_lvls, bool pixel_offset = true) {
  CUDA_KERNEL_LOOP(i, nthreads) {
    const T* offset_roi = rois + i * BBoxSize;
    int roi_batch_ind = roi_batch_id_data[i];
    // get the target level of current rois
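    // (FPN assignment rule: lvl = refer_level + log2(sqrt(roi_area) / refer_scale),
    // clamped to [min_level, max_level])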
    T roi_area = RoIArea(offset_roi, pixel_offset);
    T roi_scale = sqrt(roi_area);
    int tgt_lvl = floor(
        log2(roi_scale / static_cast<T>(refer_scale) + (T)1e-8) + refer_level);
    tgt_lvl = min(max_level, max(tgt_lvl, min_level));
    target_lvls[i] = tgt_lvl;
    // compute number of rois in the same batch and same target level
    platform::CudaAtomicAdd(
        sub_lod_list + (tgt_lvl - min_level) * lod_size + roi_batch_ind, 1);
  }
}

template <typename DeviceContext, typename T>
class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* fpn_rois = ctx.Input<paddle::framework::LoDTensor>("FpnRois");

    auto multi_fpn_rois = ctx.MultiOutput<LoDTensor>("MultiFpnRois");
    auto* restore_index = ctx.Output<Tensor>("RestoreIndex");

    const int min_level = ctx.Attr<int>("min_level");
    const int max_level = ctx.Attr<int>("max_level");
    const int refer_level = ctx.Attr<int>("refer_level");
    const int refer_scale = ctx.Attr<int>("refer_scale");
    const bool pixel_offset = ctx.Attr<bool>("pixel_offset");
    int num_level = max_level - min_level + 1;

    // check that the fpn_rois is not empty
    if (!ctx.HasInput("RoisNum")) {
      PADDLE_ENFORCE_EQ(
          fpn_rois->lod().size(), 1UL,
          platform::errors::InvalidArgument(
              "DistributeFpnProposalsOp needs LoD with one level"));
    }

    std::vector<size_t> fpn_rois_lod;
    if (ctx.HasInput("RoisNum")) {
      auto* rois_num = ctx.Input<Tensor>("RoisNum");
      fpn_rois_lod = GetLodFromRoisNum(rois_num);
    } else {
      fpn_rois_lod = fpn_rois->lod().back();
    }
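    // fpn_rois_lod: per-image offsets into FpnRois, i.e. [0, n0, n0 + n1, ...]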
    int lod_size = fpn_rois_lod.size() - 1;
    int roi_num = fpn_rois_lod[lod_size];

    auto& dev_ctx = ctx.template device_context<DeviceContext>();

    // get batch id by lod in CPU
    Tensor roi_batch_id_list;
    roi_batch_id_list.Resize({roi_num});
    int* roi_batch_id_data =
        roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
    for (int n = 0; n < lod_size; ++n) {
      for (size_t i = fpn_rois_lod[n]; i < fpn_rois_lod[n + 1]; ++i) {
        roi_batch_id_data[i] = n;
      }
    }
    // copy batch id list to GPU
    Tensor roi_batch_id_list_gpu;
    framework::TensorCopySync(roi_batch_id_list, dev_ctx.GetPlace(),
                              &roi_batch_id_list_gpu);

    Tensor sub_lod_list;
    sub_lod_list.Resize({num_level, lod_size});
    int* sub_lod_list_data = sub_lod_list.mutable_data<int>(dev_ctx.GetPlace());
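    // sub_lod_list(i, j) will count the RoIs of image j that are assigned to
    // level min_level + i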
    math::SetConstant<platform::CUDADeviceContext, int> set_zero;
    set_zero(dev_ctx, &sub_lod_list, static_cast<int>(0));

    Tensor target_lvls;
    target_lvls.Resize({roi_num});
    int* target_lvls_data = target_lvls.mutable_data<int>(dev_ctx.GetPlace());

    int dist_blocks = NumBlocks(roi_num);
    int threads = kNumCUDAThreads;
    // get target levels and sub_lod list
    GPUDistFpnProposalsHelper<T><<<dist_blocks, threads>>>(
        roi_num, fpn_rois->data<T>(), lod_size, refer_level, refer_scale,
        max_level, min_level, roi_batch_id_list_gpu.data<int>(),
        sub_lod_list_data, target_lvls_data, pixel_offset);
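    // block until the kernel has finished writing sub_lod_list and target_lvls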
    dev_ctx.Wait();
    auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());

    Tensor index_in_t;
    int* idx_in = index_in_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
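    // fill idx_in with the identity permutation 0, 1, ..., roi_num - 1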
    platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, roi_num);
    for_range(RangeInitFunctor{0, 1, idx_in});

    Tensor keys_out_t;
    int* keys_out = keys_out_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
    Tensor index_out_t;
    int* idx_out = index_out_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());

    // Determine temporary device storage requirements
    size_t temp_storage_bytes = 0;
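    // a call with a null workspace pointer only computes temp_storage_bytes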
    cub::DeviceRadixSort::SortPairs<int, int>(nullptr, temp_storage_bytes,
                                              target_lvls_data, keys_out,
                                              idx_in, idx_out, roi_num);
    // Allocate temporary storage
    auto d_temp_storage = memory::Alloc(place, temp_storage_bytes);

    // Run sorting operation
    // sort target level to get corresponding index
    cub::DeviceRadixSort::SortPairs<int, int>(
        d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
        idx_in, idx_out, roi_num);

    int* restore_idx_data =
        restore_index->mutable_data<int>({roi_num, 1}, dev_ctx.GetPlace());
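    // sorting the permutation idx_out by value yields its inverse, so
    // restore_idx_data[k] is the row of original RoI k in the sorted output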
    // sort current index to get restore index
    cub::DeviceRadixSort::SortPairs<int, int>(
        d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
        restore_idx_data, roi_num);

    int start = 0;
    auto multi_rois_num = ctx.MultiOutput<Tensor>("MultiLevelRoIsNum");

    for (int i = 0; i < num_level; ++i) {
      Tensor sub_lod = sub_lod_list.Slice(i, i + 1);
      int* sub_lod_data = sub_lod.data<int>();
      // transfer length-based lod to offset-based lod
      std::vector<size_t> offset(1, 0);
      std::vector<int> sub_lod_cpu(lod_size);
      memory::Copy(platform::CPUPlace(), sub_lod_cpu.data(), place,
                   sub_lod_data, sizeof(int) * lod_size, dev_ctx.stream());
      dev_ctx.Wait();
      for (int j = 0; j < lod_size; ++j) {
        offset.emplace_back(offset.back() + sub_lod_cpu[j]);
      }

      int sub_rois_num = offset.back();

      int end = start + sub_rois_num;
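      // gather the RoIs assigned to this level; an empty level still gets a
      // {0, kBoxDim} output tensor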
      if (end > start) {
        Tensor sub_idx = index_out_t.Slice(start, end);
        start = end;
        multi_fpn_rois[i]->mutable_data<T>({sub_rois_num, kBoxDim},
                                           dev_ctx.GetPlace());
        GPUGather<T>(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]);
      } else {
        multi_fpn_rois[i]->mutable_data<T>({sub_rois_num, kBoxDim},
                                           dev_ctx.GetPlace());
      }
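      // optionally export this level's per-image RoI counts via MultiLevelRoIsNum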
      if (multi_rois_num.size() > 0) {
        Tensor* rois_num_t = multi_rois_num[i];
        TensorCopySync(sub_lod, dev_ctx.GetPlace(), rois_num_t);
        rois_num_t->Resize({lod_size});
      }
      framework::LoD lod;
      lod.emplace_back(offset);
      multi_fpn_rois[i]->set_lod(lod);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    distribute_fpn_proposals,
    ops::GPUDistributeFpnProposalsOpKernel<paddle::platform::CUDADeviceContext,
                                           float>,
    ops::GPUDistributeFpnProposalsOpKernel<paddle::platform::CUDADeviceContext,
                                           double>);