未验证 提交 023ff4f5 编写于 作者: T Tomasz Socha 提交者: GitHub

Faster implementation of CPU kernel for ROI Align operator (#37848)

* Faster implementation of CPU kernel for ROI_ALIGN Operator

* Add missing variable to CUDA roi_align_op

* Style

* Fix boundaries

* Rename variables for indexes calculation

* Remove unnecessary emplace

* Revert "Remove unnecessary emplace"

This reverts commit c10e87f7fb812f1a672fde32f2690a97d47e2f5a.

* Style
上级 2e76d5ad
...@@ -26,6 +26,7 @@ using LoDTensor = framework::LoDTensor; ...@@ -26,6 +26,7 @@ using LoDTensor = framework::LoDTensor;
static constexpr int kNumCUDAThreads = 512; static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096; static constexpr int kNumMaxinumNumBlocks = 4096;
static constexpr int kROISize = 4;
static inline int NumBlocks(const int N) { static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
......
...@@ -12,6 +12,7 @@ limitations under the License. */ ...@@ -12,6 +12,7 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm> #include <algorithm>
#include <limits> #include <limits>
#include <numeric>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
...@@ -22,72 +23,150 @@ namespace operators { ...@@ -22,72 +23,150 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
static constexpr int kROISize = 4; namespace {
// Converts a 2-D position into a flat element offset.
// x is the position along a row, y is the row number and width is the number
// of elements per row; the result is y * width + x.
constexpr size_t get_offset(size_t x, size_t y, size_t width) {
  return x + width * y;
}
template <class T> template <class T>
void PreCalcForBilinearInterpolate( struct offsets_and_ratios {
const platform::DeviceContext& ctx, const int height, const int width, offsets_and_ratios() = default;
const int pooled_height, const int pooled_width, const int iy_upper, offsets_and_ratios(std::size_t xy, std::size_t xY, std::size_t Xy,
const int ix_upper, T roi_ymin, T roi_xmin, T bin_size_h, T bin_size_w, std::size_t XY, T xy_ratio, T xY_ratio, T Xy_ratio,
int roi_bin_grid_h, int roi_bin_grid_w, Tensor* pre_pos, Tensor* pre_w) { T XY_ratio)
int pre_calc_index = 0; : xy(xy),
int* pre_pos_data = pre_pos->mutable_data<int>(ctx.GetPlace()); xY(xY),
T* pre_w_data = pre_w->mutable_data<T>(ctx.GetPlace()); Xy(Xy),
for (int ph = 0; ph < pooled_height; ph++) { XY(XY),
for (int pw = 0; pw < pooled_width; pw++) { xy_ratio(xy_ratio),
for (int iy = 0; iy < iy_upper; iy++) { xY_ratio(xY_ratio),
// calculate y of sample points Xy_ratio(Xy_ratio),
T y = roi_ymin + ph * bin_size_h + XY_ratio(XY_ratio){};
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); std::size_t xy = 0;
// calculate x of samle points std::size_t xY = 0;
for (int ix = 0; ix < ix_upper; ix++) { std::size_t Xy = 0;
T x = roi_xmin + pw * bin_size_w + std::size_t XY = 0;
static_cast<T>(ix + .5f) * bin_size_w / T xy_ratio = 0.0f;
static_cast<T>(roi_bin_grid_w); T xY_ratio = 0.0f;
T Xy_ratio = 0.0f;
T XY_ratio = 0.0f;
};
template <typename T>
std::vector<offsets_and_ratios<T>> get_indexes_and_ratios(
std::size_t width, std::size_t height, const T roi_width,
const T roi_height, const T roi_xmin, const T roi_ymin,
std::size_t pooled_width, std::size_t roi_bin_grid_w,
std::size_t pooled_height, std::size_t roi_bin_grid_h) {
const auto ind_num =
pooled_width * roi_bin_grid_w * pooled_height * roi_bin_grid_h;
std::vector<offsets_and_ratios<T>> interpolation_cords;
interpolation_cords.reserve(ind_num);
const auto bin_w = roi_width / pooled_width;
const auto bin_h = roi_height / pooled_height;
for (std::size_t py = 0; py < pooled_height; py++) {
for (std::size_t px = 0; px < pooled_width; px++) {
for (std::size_t iy = 0; iy < roi_bin_grid_h; iy++) {
// calculate y of sample points
auto y =
roi_ymin +
bin_h * (py +
static_cast<T>(iy + .5f) / static_cast<T>(roi_bin_grid_h));
for (std::size_t ix = 0; ix < roi_bin_grid_w; ix++) {
// calculate x of sample points
auto x = roi_xmin +
bin_w * (px +
static_cast<T>(ix + .5f) /
static_cast<T>(roi_bin_grid_w));
// deal with elements out of map // deal with elements out of map
if (y < -1.0 || y > height || x < -1.0 || x > width) { if (y < -1.0 || y > height || x < -1.0 || x > width) {
for (int i = 0; i < kROISize; ++i) { interpolation_cords.emplace_back();
pre_pos_data[i + pre_calc_index * kROISize] = 0;
pre_w_data[i + pre_calc_index * kROISize] = 0;
}
pre_calc_index += 1;
continue; continue;
} }
y = y <= 0 ? 0 : y; y = y <= 0 ? 0 : y;
x = x <= 0 ? 0 : x; x = x <= 0 ? 0 : x;
int y_low = static_cast<int>(y); std::size_t x_low_index = static_cast<std::size_t>(x);
int x_low = static_cast<int>(x); std::size_t x_high_index;
int y_high; if (x_low_index >= width - 1) {
int x_high; x_high_index = x_low_index = width - 1;
if (y_low >= height - 1) { x = static_cast<T>(x_low_index);
y_high = y_low = height - 1;
y = static_cast<T>(y_low);
} else { } else {
y_high = y_low + 1; x_high_index = x_low_index + 1;
} }
if (x_low >= width - 1) { T x_ratio = x_high_index - x;
x_high = x_low = width - 1;
x = static_cast<T>(x_low); std::size_t y_low_index = static_cast<std::size_t>(y);
std::size_t y_high_index;
if (y_low_index >= height - 1) {
y_high_index = y_low_index = height - 1;
y = static_cast<T>(y_low_index);
} else { } else {
x_high = x_low + 1; y_high_index = y_low_index + 1;
} }
T ly = y - y_low, lx = x - x_low; T y_ratio = y_high_index - y;
T hy = 1. - ly, hx = 1. - lx;
pre_pos_data[pre_calc_index * kROISize] = y_low * width + x_low; auto xy = get_offset(x_low_index, y_low_index, width);
pre_pos_data[pre_calc_index * kROISize + 1] = y_low * width + x_high; auto xY = get_offset(x_low_index, y_high_index, width);
pre_pos_data[pre_calc_index * kROISize + 2] = y_high * width + x_low; auto Xy = get_offset(x_high_index, y_low_index, width);
pre_pos_data[pre_calc_index * kROISize + 3] = y_high * width + x_high; auto XY = get_offset(x_high_index, y_high_index, width);
pre_w_data[pre_calc_index * kROISize] = hy * hx;
pre_w_data[pre_calc_index * kROISize + 1] = hy * lx; auto xy_ratio = x_ratio * y_ratio;
pre_w_data[pre_calc_index * kROISize + 2] = ly * hx; auto xY_ratio = x_ratio * (1 - y_ratio);
pre_w_data[pre_calc_index * kROISize + 3] = ly * lx; auto Xy_ratio = (1 - x_ratio) * y_ratio;
pre_calc_index += 1; auto XY_ratio = (1 - x_ratio) * (1 - y_ratio);
interpolation_cords.emplace_back(xy, xY, Xy, XY, xy_ratio, xY_ratio,
Xy_ratio, XY_ratio);
} }
} }
} }
} }
return interpolation_cords;
}
// Appends one value to `interpolated_values` for every entry of
// `interpolation_cords`, in order.
//
// Each appended value is the weighted sum of four elements of `data`: the
// entry's four offsets (xy, Xy, xY, XY) select the elements and the matching
// *_ratio members are the weights.
//
// interpolated_values: output vector; existing contents are kept and the new
//                      values are appended after them.
// interpolation_cords: precomputed offsets and weights, one entry per sample.
// data:                source buffer indexed by the stored offsets.
template <typename T>
void interpolate(std::vector<T>& interpolated_values,
                 const std::vector<offsets_and_ratios<T>>& interpolation_cords,
                 const T* data) {
  for (const auto& cord : interpolation_cords) {
    // The terms are added in the order xy, Xy, xY, XY so that floating-point
    // rounding stays the same from run to run.
    const T sample = cord.xy_ratio * data[cord.xy] +
                     cord.Xy_ratio * data[cord.Xy] +
                     cord.xY_ratio * data[cord.xY] +
                     cord.XY_ratio * data[cord.XY];
    interpolated_values.push_back(sample);
  }
}
// Averages consecutive groups of interpolated samples into the pooled output.
//
// `interpolated_values` is consumed front to back in groups of
// roi_bin_grid_w * roi_bin_grid_h samples; the mean of the i-th group is
// written to output_data[i] for i in [0, pooled_width * pooled_height).
//
// interpolated_values: samples to pool; must hold at least
//                      pooled_width * pooled_height * roi_bin_grid_w *
//                      roi_bin_grid_h elements.
// output_data:         destination with room for pooled_width * pooled_height
//                      values.
// roi_bin_grid_w/h:    number of samples per bin along each axis.
// pooled_width/height: number of output bins along each axis.
template <typename T>
void avg_pool(const std::vector<T>& interpolated_values, T* output_data,
              int roi_bin_grid_w, int roi_bin_grid_h, int pooled_width,
              int pooled_height) {
  const auto data_amount = pooled_width * pooled_height;
  const auto grid_points = roi_bin_grid_w * roi_bin_grid_h;
  // A bin may end up with no sample points at all. Clamp the divisor to 1 so
  // that such bins yield 0 instead of 0 * inf = NaN.
  const T count = 1.0 / std::max(grid_points, 1);
  auto val_begin = interpolated_values.cbegin();
  for (auto i = 0; i < data_amount; ++i) {
    const auto val_end = val_begin + grid_points;
    const T sum = std::accumulate(val_begin, val_end, static_cast<T>(0.0));
    val_begin = val_end;
    // Multiply by the reciprocal computed once above rather than dividing
    // for every bin.
    output_data[i] = sum * count;
  }
}
} }
template <class T> template <class T>
...@@ -147,8 +226,6 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> { ...@@ -147,8 +226,6 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
auto sampling_ratio = ctx.Attr<int>("sampling_ratio"); auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
auto aligned = ctx.Attr<bool>("aligned"); auto aligned = ctx.Attr<bool>("aligned");
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto in_dims = in->dims(); auto in_dims = in->dims();
int batch_size = in_dims[0]; int batch_size = in_dims[0];
int channels = in_dims[1]; int channels = in_dims[1];
...@@ -209,7 +286,7 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> { ...@@ -209,7 +286,7 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
"of rois from RoIsLoD is %d", "of rois from RoIsLoD is %d",
rois_num, rois_num_with_lod)); rois_num, rois_num_with_lod));
for (int n = 0; n < rois_batch_size; ++n) { for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n; roi_batch_id_data[i] = n;
} }
} }
...@@ -231,8 +308,6 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> { ...@@ -231,8 +308,6 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
roi_height = std::max(roi_height, static_cast<T>(1.)); roi_height = std::max(roi_height, static_cast<T>(1.));
} }
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
const T* batch_data = input_data + roi_batch_id * in_stride[0]; const T* batch_data = input_data + roi_batch_id * in_stride[0];
int roi_bin_grid_h = (sampling_ratio > 0) int roi_bin_grid_h = (sampling_ratio > 0)
...@@ -241,41 +316,20 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> { ...@@ -241,41 +316,20 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
int roi_bin_grid_w = (sampling_ratio > 0) int roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio ? sampling_ratio
: ceil(roi_width / pooled_width); : ceil(roi_width / pooled_width);
const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1);
Tensor pre_pos; auto interpolation_cords = get_indexes_and_ratios(
Tensor pre_w; width, height, roi_width, roi_height, roi_xmin, roi_ymin,
int pre_size = count * out_stride[1]; pooled_width, roi_bin_grid_w, pooled_height, roi_bin_grid_h);
pre_pos.Resize({pre_size, kROISize});
pre_w.Resize({pre_size, kROISize}); std::vector<T> interpolated_values;
interpolated_values.reserve(interpolation_cords.size());
PreCalcForBilinearInterpolate( for (auto channel = 0; channel < channels; ++channel) {
dev_ctx, height, width, pooled_height, pooled_width, roi_bin_grid_h, interpolate(interpolated_values, interpolation_cords, batch_data);
roi_bin_grid_w, roi_ymin, roi_xmin, bin_size_h, bin_size_w, avg_pool(interpolated_values, output_data, roi_bin_grid_w,
roi_bin_grid_h, roi_bin_grid_w, &pre_pos, &pre_w); roi_bin_grid_h, pooled_width, pooled_height);
const int* pre_pos_data = pre_pos.data<int>();
const T* pre_w_data = pre_w.data<T>();
for (int c = 0; c < channels; c++) {
int pre_calc_index = 0;
for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) {
const int pool_index = ph * pooled_width + pw;
T output_val = 0;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
for (int i = 0; i < kROISize; i++) {
int pos = pre_pos_data[pre_calc_index * kROISize + i];
T w = pre_w_data[pre_calc_index * kROISize + i];
output_val += w * batch_data[pos];
}
pre_calc_index += 1;
}
}
output_val /= count;
output_data[pool_index] = output_val;
}
}
batch_data += in_stride[1]; batch_data += in_stride[1];
output_data += out_stride[1]; output_data += out_stride[1];
interpolated_values.clear();
} }
rois_data += roi_stride[0]; rois_data += roi_stride[0];
} }
...@@ -328,7 +382,7 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel<T> { ...@@ -328,7 +382,7 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
auto rois_lod = rois->lod().back(); auto rois_lod = rois->lod().back();
rois_batch_size = rois_lod.size() - 1; rois_batch_size = rois_lod.size() - 1;
for (int n = 0; n < rois_batch_size; ++n) { for (int n = 0; n < rois_batch_size; ++n) {
for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n; roi_batch_id_data[i] = n;
} }
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册