From 023ff4f5a49823536d427ad898534bb90ca53372 Mon Sep 17 00:00:00 2001
From: Tomasz Socha <tomasz.socha@intel.com>
Date: Thu, 16 Dec 2021 10:35:35 +0100
Subject: [PATCH] Faster implementation of CPU kernel for ROI Align operator
 (#37848)

* Faster implementation of CPU kernel for ROI_ALIGN Operator

* Add missing variable to CUDA roi_align_op

* Style

* Fix boundaries

* Rename variables for indexes calculation

* Remove unnecessary emplace

* Revert "Remove unnecessary emplace"

This reverts commit c10e87f7fb812f1a672fde32f2690a97d47e2f5a.

* Style
---
 paddle/fluid/operators/roi_align_op.cu |   1 +
 paddle/fluid/operators/roi_align_op.h  | 230 +++++++++++++++----------
 2 files changed, 143 insertions(+), 88 deletions(-)
diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu
index a08339d776..3b25676fb0 100644
--- a/paddle/fluid/operators/roi_align_op.cu
+++ b/paddle/fluid/operators/roi_align_op.cu
@@ -26,6 +26,7 @@ using LoDTensor = framework::LoDTensor;
 
 static constexpr int kNumCUDAThreads = 512;
 static constexpr int kNumMaxinumNumBlocks = 4096;
+static constexpr int kROISize = 4;
 
 static inline int NumBlocks(const int N) {
   return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h
index 29c9268d52..1ab5ddc83f 100644
--- a/paddle/fluid/operators/roi_align_op.h
+++ b/paddle/fluid/operators/roi_align_op.h
@@ -12,6 +12,7 @@ limitations under the License. */
 #pragma once
 #include <algorithm>
 #include <limits>
+#include <numeric>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
@@ -22,72 +23,150 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 
-static constexpr int kROISize = 4;
+namespace {
+constexpr size_t get_offset(size_t x, size_t y, size_t width) {
+  return y * width + x;
+}
 
 template <class T>
-void PreCalcForBilinearInterpolate(
-    const platform::DeviceContext& ctx, const int height, const int width,
-    const int pooled_height, const int pooled_width, const int iy_upper,
-    const int ix_upper, T roi_ymin, T roi_xmin, T bin_size_h, T bin_size_w,
-    int roi_bin_grid_h, int roi_bin_grid_w, Tensor* pre_pos, Tensor* pre_w) {
-  int pre_calc_index = 0;
-  int* pre_pos_data = pre_pos->mutable_data<int>(ctx.GetPlace());
-  T* pre_w_data = pre_w->mutable_data<T>(ctx.GetPlace());
-  for (int ph = 0; ph < pooled_height; ph++) {
-    for (int pw = 0; pw < pooled_width; pw++) {
-      for (int iy = 0; iy < iy_upper; iy++) {
-        // calculate y of sample points
-        T y = roi_ymin + ph * bin_size_h +
-              static_cast<T>(iy + .5f) * bin_size_h /
-                  static_cast<T>(roi_bin_grid_h);
-        // calculate x of samle points
-        for (int ix = 0; ix < ix_upper; ix++) {
-          T x = roi_xmin + pw * bin_size_w +
-                static_cast<T>(ix + .5f) * bin_size_w /
-                    static_cast<T>(roi_bin_grid_w);
+struct offsets_and_ratios {
+  offsets_and_ratios() = default;
+  offsets_and_ratios(std::size_t xy, std::size_t xY, std::size_t Xy,
+                     std::size_t XY, T xy_ratio, T xY_ratio, T Xy_ratio,
+                     T XY_ratio)
+      : xy(xy),
+        xY(xY),
+        Xy(Xy),
+        XY(XY),
+        xy_ratio(xy_ratio),
+        xY_ratio(xY_ratio),
+        Xy_ratio(Xy_ratio),
+        XY_ratio(XY_ratio){};
+
+  std::size_t xy = 0;
+  std::size_t xY = 0;
+  std::size_t Xy = 0;
+  std::size_t XY = 0;
+  T xy_ratio = 0.0f;
+  T xY_ratio = 0.0f;
+  T Xy_ratio = 0.0f;
+  T XY_ratio = 0.0f;
+};
+
+template <typename T>
+std::vector<offsets_and_ratios<T>> get_indexes_and_ratios(
+    std::size_t width, std::size_t height, const T roi_width,
+    const T roi_height, const T roi_xmin, const T roi_ymin,
+    std::size_t pooled_width, std::size_t roi_bin_grid_w,
+    std::size_t pooled_height, std::size_t roi_bin_grid_h) {
+  const auto ind_num =
+      pooled_width * roi_bin_grid_w * pooled_height * roi_bin_grid_h;
+
+  std::vector<offsets_and_ratios<T>> interpolation_cords;
+  interpolation_cords.reserve(ind_num);
+
+  const auto bin_w = roi_width / pooled_width;
+  const auto bin_h = roi_height / pooled_height;
+
+  for (std::size_t py = 0; py < pooled_height; py++) {
+    for (std::size_t px = 0; px < pooled_width; px++) {
+      for (std::size_t iy = 0; iy < roi_bin_grid_h; iy++) {
+        // calculate x of sample points
+        auto y =
+            roi_ymin +
+            bin_h * (py +
+                     static_cast<T>(iy + .5f) / static_cast<T>(roi_bin_grid_h));
+        for (std::size_t ix = 0; ix < roi_bin_grid_w; ix++) {
+          // calculate x of sample points
+          auto x = roi_xmin +
+                   bin_w * (px +
+                            static_cast<T>(ix + .5f) /
+                                static_cast<T>(roi_bin_grid_w));
+
           // deal with elements out of map
           if (y < -1.0 || y > height || x < -1.0 || x > width) {
-            for (int i = 0; i < kROISize; ++i) {
-              pre_pos_data[i + pre_calc_index * kROISize] = 0;
-              pre_w_data[i + pre_calc_index * kROISize] = 0;
-            }
-            pre_calc_index += 1;
+            interpolation_cords.emplace_back();
             continue;
           }
           y = y <= 0 ? 0 : y;
           x = x <= 0 ? 0 : x;
 
-          int y_low = static_cast<int>(y);
-          int x_low = static_cast<int>(x);
-          int y_high;
-          int x_high;
-          if (y_low >= height - 1) {
-            y_high = y_low = height - 1;
-            y = static_cast<T>(y_low);
+          std::size_t x_low_index = static_cast<std::size_t>(x);
+          std::size_t x_high_index;
+          if (x_low_index >= width - 1) {
+            x_high_index = x_low_index = width - 1;
+            x = static_cast<T>(x_low_index);
           } else {
-            y_high = y_low + 1;
+            x_high_index = x_low_index + 1;
           }
-          if (x_low >= width - 1) {
-            x_high = x_low = width - 1;
-            x = static_cast<T>(x_low);
+          T x_ratio = x_high_index - x;
+
+          std::size_t y_low_index = static_cast<std::size_t>(y);
+          std::size_t y_high_index;
+          if (y_low_index >= height - 1) {
+            y_high_index = y_low_index = height - 1;
+            y = static_cast<T>(y_low_index);
           } else {
-            x_high = x_low + 1;
+            y_high_index = y_low_index + 1;
           }
-          T ly = y - y_low, lx = x - x_low;
-          T hy = 1. - ly, hx = 1. - lx;
-          pre_pos_data[pre_calc_index * kROISize] = y_low * width + x_low;
-          pre_pos_data[pre_calc_index * kROISize + 1] = y_low * width + x_high;
-          pre_pos_data[pre_calc_index * kROISize + 2] = y_high * width + x_low;
-          pre_pos_data[pre_calc_index * kROISize + 3] = y_high * width + x_high;
-          pre_w_data[pre_calc_index * kROISize] = hy * hx;
-          pre_w_data[pre_calc_index * kROISize + 1] = hy * lx;
-          pre_w_data[pre_calc_index * kROISize + 2] = ly * hx;
-          pre_w_data[pre_calc_index * kROISize + 3] = ly * lx;
-          pre_calc_index += 1;
+          T y_ratio = y_high_index - y;
+
+          auto xy = get_offset(x_low_index, y_low_index, width);
+          auto xY = get_offset(x_low_index, y_high_index, width);
+          auto Xy = get_offset(x_high_index, y_low_index, width);
+          auto XY = get_offset(x_high_index, y_high_index, width);
+
+          auto xy_ratio = x_ratio * y_ratio;
+          auto xY_ratio = x_ratio * (1 - y_ratio);
+          auto Xy_ratio = (1 - x_ratio) * y_ratio;
+          auto XY_ratio = (1 - x_ratio) * (1 - y_ratio);
+
+          interpolation_cords.emplace_back(xy, xY, Xy, XY, xy_ratio, xY_ratio,
+                                           Xy_ratio, XY_ratio);
         }
       }
     }
   }
+  return interpolation_cords;
+}
+
+template <typename T>
+void interpolate(std::vector<T>& interpolated_values,
+                 const std::vector<offsets_and_ratios<T>>& interpolation_cords,
+                 const T* data) {
+  for (auto& ic : interpolation_cords) {
+    auto xlyl_offset = ic.xy;
+    auto xhyl_offset = ic.Xy;
+    auto xlyh_offset = ic.xY;
+    auto xhyh_offset = ic.XY;
+
+    auto xlyl_ratio = ic.xy_ratio;
+    auto xhyl_ratio = ic.Xy_ratio;
+    auto xlyh_ratio = ic.xY_ratio;
+    auto xhyh_ratio = ic.XY_ratio;
+
+    interpolated_values.emplace_back(
+        xlyl_ratio * data[xlyl_offset] + xhyl_ratio * data[xhyl_offset] +
+        xlyh_ratio * data[xlyh_offset] + xhyh_ratio * data[xhyh_offset]);
+  }
+}
+
+template <typename T>
+void avg_pool(const std::vector<T>& interpolated_values, T* output_data,
+              int roi_bin_grid_w, int roi_bin_grid_h, int pooled_width,
+              int pooled_height) {
+  const auto data_amount = pooled_width * pooled_height;
+  const auto grid_points = roi_bin_grid_w * roi_bin_grid_h;
+  const T count = 1.0 / grid_points;
+  auto val_begin = interpolated_values.cbegin();
+  for (auto i = 0; i < data_amount; ++i) {
+    T sum = 0.0;
+    auto val_end = val_begin + grid_points;
+    sum = std::accumulate(val_begin, val_end, sum);
+    val_begin = val_end;
+    output_data[i] = sum * count;
+  }
+}
 }
 
 template <class T>
@@ -147,8 +226,6 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
     auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
     auto aligned = ctx.Attr<bool>("aligned");
 
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-
     auto in_dims = in->dims();
     int batch_size = in_dims[0];
     int channels = in_dims[1];
@@ -209,7 +286,7 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
               "of rois from RoIsLoD is %d",
               rois_num, rois_num_with_lod));
       for (int n = 0; n < rois_batch_size; ++n) {
-        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+        for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
           roi_batch_id_data[i] = n;
         }
       }
@@ -231,8 +308,6 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
         roi_height = std::max(roi_height, static_cast<T>(1.));
       }
 
-      T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
-      T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
       const T* batch_data = input_data + roi_batch_id * in_stride[0];
 
       int roi_bin_grid_h = (sampling_ratio > 0)
@@ -241,41 +316,20 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
       int roi_bin_grid_w = (sampling_ratio > 0)
                                ? sampling_ratio
                                : ceil(roi_width / pooled_width);
-      const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1);
-      Tensor pre_pos;
-      Tensor pre_w;
-      int pre_size = count * out_stride[1];
-      pre_pos.Resize({pre_size, kROISize});
-      pre_w.Resize({pre_size, kROISize});
-
-      PreCalcForBilinearInterpolate(
-          dev_ctx, height, width, pooled_height, pooled_width, roi_bin_grid_h,
-          roi_bin_grid_w, roi_ymin, roi_xmin, bin_size_h, bin_size_w,
-          roi_bin_grid_h, roi_bin_grid_w, &pre_pos, &pre_w);
-      const int* pre_pos_data = pre_pos.data<int>();
-      const T* pre_w_data = pre_w.data<T>();
-      for (int c = 0; c < channels; c++) {
-        int pre_calc_index = 0;
-        for (int ph = 0; ph < pooled_height; ph++) {
-          for (int pw = 0; pw < pooled_width; pw++) {
-            const int pool_index = ph * pooled_width + pw;
-            T output_val = 0;
-            for (int iy = 0; iy < roi_bin_grid_h; iy++) {
-              for (int ix = 0; ix < roi_bin_grid_w; ix++) {
-                for (int i = 0; i < kROISize; i++) {
-                  int pos = pre_pos_data[pre_calc_index * kROISize + i];
-                  T w = pre_w_data[pre_calc_index * kROISize + i];
-                  output_val += w * batch_data[pos];
-                }
-                pre_calc_index += 1;
-              }
-            }
-            output_val /= count;
-            output_data[pool_index] = output_val;
-          }
-        }
+
+      auto interpolation_cords = get_indexes_and_ratios(
+          width, height, roi_width, roi_height, roi_xmin, roi_ymin,
+          pooled_width, roi_bin_grid_w, pooled_height, roi_bin_grid_h);
+
+      std::vector<T> interpolated_values;
+      interpolated_values.reserve(interpolation_cords.size());
+      for (auto channel = 0; channel < channels; ++channel) {
+        interpolate(interpolated_values, interpolation_cords, batch_data);
+        avg_pool(interpolated_values, output_data, roi_bin_grid_w,
+                 roi_bin_grid_h, pooled_width, pooled_height);
         batch_data += in_stride[1];
         output_data += out_stride[1];
+        interpolated_values.clear();
       }
       rois_data += roi_stride[0];
     }
@@ -328,7 +382,7 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
       auto rois_lod = rois->lod().back();
       rois_batch_size = rois_lod.size() - 1;
       for (int n = 0; n < rois_batch_size; ++n) {
-        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+        for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
           roi_batch_id_data[i] = n;
         }
       }
-- 
GitLab