prior_box_kernel.cpp 6.3 KB
Newer Older
Y
yangfei 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PRIORBOX_OP

#include "operators/kernel/prior_box_kernel.h"
18
#include "framework/cl/cl_tensor.h"
Y
yangfei 已提交
19
namespace paddle_mobile {
Y
yangfei 已提交
20
namespace operators {
Y
yangfei 已提交
21

Y
yangfei 已提交
22 23
/**
 * One-time setup for the OpenCL prior-box kernel.
 *
 * Registers the "prior_box" kernel from "prior_box_kernel.cl" with the
 * kernel's CL helper. It is the only kernel added here, which is why
 * Compute() retrieves it with KernelAt(0).
 *
 * @param param operator parameters; not consulted by this function.
 * @return always true.
 */
template <>
bool PriorBoxKernel<GPU_CL, float>::Init(PriorBoxParam<GPU_CL> *param) {
  this->cl_helper_.AddKernel("prior_box", "prior_box_kernel.cl");
  return true;
}
Y
yangfei 已提交
27

Y
yangfei 已提交
28 29
/**
 * Runs the OpenCL "prior_box" kernel for one operator invocation.
 *
 * Host-side work done here:
 *   1. Expand the configured aspect ratios (honouring the flip flag).
 *   2. Derive the step between prior centres, either from the explicit
 *      StepW/StepH parameters or from image size / feature-map size.
 *   3. Compute the half-width and half-height of every prior box and stage
 *      them in two CLTensor buffers.
 *   4. Bind the 13 kernel arguments and enqueue a 2-D NDRange of
 *      {c_block, nh} taken from the default work size of the boxes output.
 *
 * @param param operator parameters (inputs, outputs and box configuration).
 */
template <>
void PriorBoxKernel<GPU_CL, float>::Compute(
    const PriorBoxParam<GPU_CL> &param) {
  const auto &input_dims = param.Input()->dims();
  const auto &input_image_dims = param.InputImage()->dims();

  const auto &min_sizes = param.MinSizes();
  const auto &max_sizes = param.MaxSizes();
  const auto &input_aspect_ratio = param.AspectRatios();
  const bool flip = param.Flip();
  const float step_w = param.StepW();
  const float step_h = param.StepH();
  const float offset = param.Offset();
  const int C = param.OutputBoxes()->dims()[1];

  auto output_boxes = param.OutputBoxes()->GetCLImage();
  // NOTE(review): the variances image is fetched but never bound as a kernel
  // argument below, and Variances()/Clip() are not consumed by this host
  // code - confirm the variances output is filled elsewhere.
  auto output_variances = param.OutputVariances()->GetCLImage();
  (void)output_variances;

  std::vector<float> aspect_ratios;
  ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);

  // clSetKernelArg copies exactly `arg_size` bytes from the pointer it is
  // given. These two values are passed with sizeof(int), so they must really
  // be ints; plain `auto` would take whatever DDim::operator[] yields
  // (apparently a 64-bit integer), which only works by accident on
  // little-endian hosts.
  const int img_width = static_cast<int>(input_image_dims[3]);
  const int img_height = static_cast<int>(input_image_dims[2]);

  const auto feature_width = input_dims[3];
  const auto feature_height = input_dims[2];

  // Fall back to image size / feature-map size (e.g. 300 / 19) when either
  // explicit step is left at zero.
  float step_width = step_w;
  float step_height = step_h;
  if (step_w == 0 || step_h == 0) {
    step_width = static_cast<float>(img_width) / feature_width;
    step_height = static_cast<float>(img_height) / feature_height;
  }

  // One prior per (min_size, aspect_ratio) pair plus one per max_size.
  const int num_priors = static_cast<int>(
      aspect_ratios.size() * min_sizes.size() + max_sizes.size());

  // Host staging buffers. They are zero-initialised so any slot that is not
  // written below has a defined value, and they are declared before the
  // CLTensors so they outlive them.
  std::vector<float> box_width(num_priors, 0.f);
  std::vector<float> box_height(num_priors, 0.f);

  int idx = 0;
  // Stores one (half-width, half-height) pair; silently ignores writes past
  // num_priors so an unexpected ratio list cannot overrun the buffers.
  auto append_box = [&](double half_w, double half_h) {
    if (idx < num_priors) {
      box_width[idx] = static_cast<float>(half_w);
      box_height[idx] = static_cast<float>(half_h);
      ++idx;
    }
  };

  for (size_t s = 0; s < min_sizes.size(); ++s) {
    const auto min_size = min_sizes[s];
    // Only pair a max_size with this min_size when one exists at the same
    // index; avoids reading past the end of a shorter max_sizes list.
    const bool has_max_size = s < max_sizes.size();

    if (param.MinMaxAspectRatiosOrder()) {
      // Order: min-size square, max-size square, then the non-unit ratios.
      append_box(min_size / 2., min_size / 2.);
      if (has_max_size) {
        const auto max_size = max_sizes[s];
        const double half_side = sqrt(min_size * max_size) / 2.;
        append_box(half_side, half_side);
      }
      for (float ar : aspect_ratios) {
        // The ratio ~1 box is the min-size square already emitted above.
        if (fabs(ar - 1.) < 1e-6) {
          continue;
        }
        append_box(min_size * sqrt(ar) / 2., min_size / sqrt(ar) / 2.);
      }
    } else {
      // Order: every aspect ratio first, then the max-size square.
      for (float ar : aspect_ratios) {
        append_box(min_size * sqrt(ar) / 2., min_size / sqrt(ar) / 2.);
      }
      if (has_max_size) {
        const auto max_size = max_sizes[s];
        const double half_side = sqrt(min_size * max_size) / 2.;
        append_box(half_side, half_side);
      }
    }
  }

  cl_int status;
  auto kernel = this->cl_helper_.KernelAt(0);
  auto default_work_size =
      this->cl_helper_.DefaultWorkSize(*param.OutputBoxes());
  int c_block = static_cast<int>(default_work_size[0]);
  int w = static_cast<int>(default_work_size[1]);
  int nh = static_cast<int>(default_work_size[2]);

  // Both box tensors are shaped {1, 1, 1, num_priors}.
  std::vector<int64_t> box_shape({1, 1, 1, num_priors});
  framework::DDim ddim = framework::make_ddim(box_shape);

  framework::CLTensor box_width_cl_tensor(this->cl_helper_.CLContext(),
                                          this->cl_helper_.CLCommandQueue());
  box_width_cl_tensor.Resize(ddim);
  cl_mem box_width_Buffer =
      box_width_cl_tensor.mutable_with_data<float>(box_width.data());

  framework::CLTensor box_height_cl_tensor(this->cl_helper_.CLContext(),
                                           this->cl_helper_.CLCommandQueue());
  box_height_cl_tensor.Resize(ddim);
  cl_mem box_height_Buffer =
      box_height_cl_tensor.mutable_with_data<float>(box_height.data());

  DLOG << "c_block:" << c_block;
  DLOG << "w:" << w;
  DLOG << "nh:" << nh;
  DLOG << "step_width:" << step_width;
  DLOG << "step_height:" << step_height;
  DLOG << "offset:" << offset;
  DLOG << "img_width:" << img_width;
  DLOG << "img_height:" << img_height;
  DLOG << "num_priors:" << num_priors;
  DLOG << "C:" << C;

  // Argument indices must stay in sync with the "prior_box" kernel signature
  // in prior_box_kernel.cl.
  status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 1, sizeof(int), &w);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &box_width_Buffer);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &box_height_Buffer);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output_boxes);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 6, sizeof(float), &step_width);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 7, sizeof(float), &step_height);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 8, sizeof(float), &offset);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 9, sizeof(int), &img_width);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 10, sizeof(int), &img_height);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 11, sizeof(int), &num_priors);
  CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 12, sizeof(int), &C);
  CL_CHECK_ERRORS(status);

  // Explicit casts: int -> size_t inside a braced initialiser is a narrowing
  // conversion and therefore ill-formed.
  size_t global_work_size[2] = {static_cast<size_t>(c_block),
                                static_cast<size_t>(nh)};
  status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
                                  nullptr, global_work_size, nullptr, 0,
                                  nullptr, nullptr);
  CL_CHECK_ERRORS(status);
}
Y
yangfei 已提交
177
// Explicit instantiation of the kernel class for the GPU_CL device type with
// float data, matching the Init/Compute specialisations defined above.
template class PriorBoxKernel<GPU_CL, float>;
Y
yangfei 已提交
178

Y
yangfei 已提交
179
}  // namespace operators
Y
yangfei 已提交
180 181 182
}  // namespace paddle_mobile

#endif