Add roi_perspective_transform op

cbf8a973 · hjchen2 · 91c5d1a0 · cbf8a973 · cbf8a973 · cbf8a973
9 changed file
--- a/src/common/types.cpp
+++ b/src/common/types.cpp
@@ -108,6 +108,7 @@ const char *G_OP_TYPE_SLICE = "slice";
 const char *G_OP_TYPE_ANCHOR_GENERATOR = "anchor_generator";
 const char *G_OP_TYPE_GENERATE_PROPOSALS = "generate_proposals";
 const char *G_OP_TYPE_PSROI_POOL = "psroi_pool";
+const char *G_OP_TYPE_ROI_PERSPECTIVE = "roi_perspective_transform";
 std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
@@ -204,5 +205,6 @@ std::unordered_map<
        {G_OP_TYPE_GENERATE_PROPOSALS,
         {{"Scores", "BboxDeltas", "ImInfo", "Anchors", "Variances"},
          {"RpnRois", "RpnRoiProbs"}}},
-        {G_OP_TYPE_PSROI_POOL, {{"X", "ROIs"}, {"Out"}}}};
+        {G_OP_TYPE_PSROI_POOL, {{"X", "ROIs"}, {"Out"}}},
+        {G_OP_TYPE_ROI_PERSPECTIVE, {{"X", "ROIs"}, {"Out"}}}};
 }  // namespace paddle_mobile
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -196,6 +196,7 @@ extern const char *G_OP_TYPE_SLICE;
 extern const char *G_OP_TYPE_ANCHOR_GENERATOR;
 extern const char *G_OP_TYPE_GENERATE_PROPOSALS;
 extern const char *G_OP_TYPE_PSROI_POOL;
+extern const char *G_OP_TYPE_ROI_PERSPECTIVE;
 extern std::unordered_map<
    std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>

--- a/src/framework/load_ops.h
+++ b/src/framework/load_ops.h
@@ -315,3 +315,6 @@ LOAD_OP1(generate_proposals, CPU);
 #ifdef PSROI_POOL_OP
 LOAD_OP1(psroi_pool, CPU);
 #endif
+#ifdef ROI_PERSPECTIVE_OP
+LOAD_OP1(roi_perspective_transform, CPU);
+#endif
--- a/src/operators/detection_ops.cpp
+++ b/src/operators/detection_ops.cpp
@@ -64,6 +64,22 @@ void PSRoiPoolOp<DeviceType, T>::InferShape() const {
 }
 #endif
+#ifdef ROI_PERSPECTIVE_OP
+template <typename DeviceType, typename T>
+void RoiPerspectiveOp<DeviceType, T>::InferShape() const {
+  const auto &input_dims = this->param_.input_x_->dims();
+  const auto &rois_dims = this->param_.input_rois_->dims();
+  const int transformed_height = this->param_.transformed_height_;
+  const int transformed_width = this->param_.transformed_width_;
+  std::vector<int64_t> out_dims_v({rois_dims[0],   // num_rois
+                                   input_dims[1],  // channels
+                                   static_cast<int64_t>(transformed_height),
+                                   static_cast<int64_t>(transformed_width)});
+  auto out_dims = framework::make_ddim(out_dims_v);
+  this->param_.output_->Resize(out_dims);
+}
+#endif
 }  // namespace operators
 }  // namespace paddle_mobile
@@ -78,4 +94,7 @@ REGISTER_OPERATOR_CPU(generate_proposals, ops::ProposalOp);
 #ifdef PSROI_POOL_OP
 REGISTER_OPERATOR_CPU(psroi_pool, ops::PSRoiPoolOp);
 #endif
+#ifdef ROI_PERSPECTIVE_OP
+REGISTER_OPERATOR_CPU(roi_perspective_transform, ops::RoiPerspectiveOp);
+#endif
 #endif
--- a/src/operators/detection_ops.h
+++ b/src/operators/detection_ops.h
@@ -34,5 +34,9 @@ DECLARE_OPERATOR(Proposal, ProposalParam, ProposalKernel);
 DECLARE_OPERATOR(PSRoiPool, PSRoiPoolParam, PSRoiPoolKernel);
 #endif
+#ifdef ROI_PERSPECTIVE_OP
+DECLARE_OPERATOR(RoiPerspective, RoiPerspectiveParam, RoiPerspectiveKernel);
+#endif
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/src/operators/kernel/arm/resize_kernel.cpp
+++ b/src/operators/kernel/arm/resize_kernel.cpp
@@ -22,8 +22,8 @@ namespace operators {
 void BiLinearResizeTensor(const float* src, const int src_height,
                          const int src_width, float* dst, const int dst_height,
                          const int dst_width) {
-  const float scale_w = src_width / (float)dst_width;
+  const float scale_w = src_width / static_cast<float>(dst_width);
-  const float scale_h = src_height / (float)dst_height;
+  const float scale_h = src_height / static_cast<float>(dst_height);
  float* dst_data = dst;
  const float* src_data = src;
@@ -33,8 +33,8 @@ void BiLinearResizeTensor(const float* src, const int src_height,
    int src_h = std::floor(fh);
    fh -= src_h;
-    const float w_h0 = std::abs((float)1.0 - fh);
+    const float w_h0 = fabs(1.0 - fh);
-    const float w_h1 = std::abs(fh);
+    const float w_h1 = fabs(fh);
    const int dst_offset_1 = dst_h * dst_width;
    const int src_offset_1 = src_h * src_width;
@@ -45,8 +45,8 @@ void BiLinearResizeTensor(const float* src, const int src_height,
      float fw = dst_w * scale_w;
      int src_w = std::floor(fw);
      fw -= src_w;
-      const float w_w0 = std::abs((float)1.0 - fw);
+      const float w_w0 = fabs(1.0 - fw);
-      const float w_w1 = std::abs(fw);
+      const float w_w1 = fabs(fw);
      float dst_value = 0;

--- a/src/operators/kernel/arm/roi_perspective_kernel.cpp
+++ b/src/operators/kernel/arm/roi_perspective_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef ROI_PERSPECTIVE_OP
+#include <cmath>
+#include "operators/kernel/detection_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+template <typename T>
+inline bool GT_E(T a, T b) {
+  return (a > b) || fabs(a - b) < 1e-4;
+}
+template <typename T>
+inline bool LT_E(T a, T b) {
+  return (a < b) || fabs(a - b) < 1e-4;
+}
+// check if (x, y) is in the boundary of roi
+template <typename T>
+bool in_quad(T x, T y, T roi_x[], T roi_y[]) {
+  for (int i = 0; i < 4; i++) {
+    T xs = roi_x[i];
+    T ys = roi_y[i];
+    T xe = roi_x[(i + 1) % 4];
+    T ye = roi_y[(i + 1) % 4];
+    if (fabs(ys - ye) < 1e-4) {
+      if (fabs(y - ys) < 1e-4 && fabs(y - ye) < 1e-4 &&
+          GT_E<T>(x, std::min(xs, xe)) && LT_E<T>(x, std::max(xs, xe))) {
+        return true;
+      }
+    } else {
+      T intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs;
+      if (fabs(intersec_x - x) < 1e-4 && GT_E<T>(y, std::min(ys, ye)) &&
+          LT_E<T>(y, std::max(ys, ye))) {
+        return true;
+      }
+    }
+  }
+  int n_cross = 0;
+  for (int i = 0; i < 4; i++) {
+    T xs = roi_x[i];
+    T ys = roi_y[i];
+    T xe = roi_x[(i + 1) % 4];
+    T ye = roi_y[(i + 1) % 4];
+    if (fabs(ys - ye) < 1e-4) {
+      continue;
+    }
+    if (LT_E<T>(y, std::min(ys, ye)) || (y > std::max(ys, ye))) {
+      continue;
+    }
+    T intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs;
+    if (fabs(intersec_x - x) < 1e-4) {
+      return true;
+    }
+    if (intersec_x > x) {
+      n_cross++;
+    }
+  }
+  return (n_cross % 2 == 1);
+}
+template <typename T>
+void get_transform_matrix(const int transformed_width,
+                          const int transformed_height, T roi_x[], T roi_y[],
+                          T matrix[]) {
+  T x0 = roi_x[0];
+  T x1 = roi_x[1];
+  T x2 = roi_x[2];
+  T x3 = roi_x[3];
+  T y0 = roi_y[0];
+  T y1 = roi_y[1];
+  T y2 = roi_y[2];
+  T y3 = roi_y[3];
+  // Estimate the height and width of RoI
+  T len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
+  T len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2));
+  T len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3));
+  T len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0));
+  T estimated_height = (len2 + len4) / 2.0;
+  T estimated_width = (len1 + len3) / 2.0;
+  // Get the normalized height and normalized width
+  int normalized_height = transformed_height;
+  int normalized_width =
+      std::round(estimated_width * (normalized_height - 1) / estimated_height) +
+      1;
+  normalized_width = std::min(normalized_width, transformed_width);
+  T dx1 = x1 - x2;
+  T dx2 = x3 - x2;
+  T dx3 = x0 - x1 + x2 - x3;
+  T dy1 = y1 - y2;
+  T dy2 = y3 - y2;
+  T dy3 = y0 - y1 + y2 - y3;
+  matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) /
+              (normalized_width - 1);
+  matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) /
+              (normalized_height - 1);
+  matrix[8] = 1;
+  matrix[3] = (y1 - y0 + matrix[6] * (normalized_width - 1) * y1) /
+              (normalized_width - 1);
+  matrix[4] = (y3 - y0 + matrix[7] * (normalized_height - 1) * y3) /
+              (normalized_height - 1);
+  matrix[5] = y0;
+  matrix[0] = (x1 - x0 + matrix[6] * (normalized_width - 1) * x1) /
+              (normalized_width - 1);
+  matrix[1] = (x3 - x0 + matrix[7] * (normalized_height - 1) * x3) /
+              (normalized_height - 1);
+  matrix[2] = x0;
+}
+// Get the source coordinates in the input feature map.
+// (u, v, w)^matrix = matrix * (out_w, out_h, 1)^matrix
+// in_w = u / w
+// in_h = v / w
+template <typename T>
+void get_source_coords(T matrix[], int out_w, int out_h, T *in_w, T *in_h) {
+  T u = matrix[0] * out_w + matrix[1] * out_h + matrix[2];
+  T v = matrix[3] * out_w + matrix[4] * out_h + matrix[5];
+  T w = matrix[6] * out_w + matrix[7] * out_h + matrix[8];
+  in_w[0] = u / w;
+  in_h[0] = v / w;
+}
+template <typename T>
+void bilinear_interpolate(const T *in_data, const int channels, const int width,
+                          const int height, int in_n, int in_c, T in_w, T in_h,
+                          T *val) {
+  // Deal with cases that source coords are out of feature map boundary
+  if ((-0.5 > in_w) || (in_w > width - 0.5) || (-0.5 > in_h) ||
+      (in_h > height - 0.5)) {
+    // empty
+    val[0] = 0.0;
+    return;
+  }
+  if (in_w < 0) {
+    in_w = 0;
+  }
+  if (in_h < 0) {
+    in_h = 0;
+  }
+  int in_w_floor = floor(in_w);
+  int in_h_floor = floor(in_h);
+  int in_w_ceil;
+  int in_h_ceil;
+  if (GT_E<T>(in_w_floor, width - 1)) {
+    in_w_ceil = in_w_floor = width - 1;
+    in_w = static_cast<T>(in_w_floor);
+  } else {
+    in_w_ceil = in_w_floor + 1;
+  }
+  if (GT_E<T>(in_h_floor, height - 1)) {
+    in_h_ceil = in_h_floor = height - 1;
+    in_h = static_cast<T>(in_h_floor);
+  } else {
+    in_h_ceil = in_h_floor + 1;
+  }
+  T w_floor = in_w - in_w_floor;
+  T h_floor = in_h - in_h_floor;
+  T w_ceil = 1 - w_floor;
+  T h_ceil = 1 - h_floor;
+  const T *data = in_data + (in_n * channels + in_c) * height * width;
+  // Do bilinear interpolation
+  T v1 = data[in_h_floor * width + in_w_floor];
+  T v2 = data[in_h_ceil * width + in_w_floor];
+  T v3 = data[in_h_ceil * width + in_w_ceil];
+  T v4 = data[in_h_floor * width + in_w_ceil];
+  T w1 = w_ceil * h_ceil;
+  T w2 = w_ceil * h_floor;
+  T w3 = w_floor * h_floor;
+  T w4 = w_floor * h_ceil;
+  val[0] = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
+}
+template <>
+bool RoiPerspectiveKernel<CPU, float>::Init(RoiPerspectiveParam<CPU> *param) {
+  return true;
+}
+template <>
+void RoiPerspectiveKernel<CPU, float>::Compute(
+    const RoiPerspectiveParam<CPU> &param) {
+  const auto *input_x = param.input_x_;
+  const auto *input_rois = param.input_rois_;
+  auto *output = param.output_;
+  const auto &in_dims = input_x->dims();
+  const int channels = in_dims[1];
+  const int in_height = in_dims[2];
+  const int in_width = in_dims[3];
+  const int rois_num = input_rois->dims()[0];
+  const int transformed_height = param.transformed_height_;
+  const int transformed_width = param.transformed_width_;
+  const float spatial_scale = param.spatial_scale_;
+  const float *input_data = input_x->data<float>();
+  const float *rois_data = input_rois->data<float>();
+  float *output_data = output->mutable_data<float>();
+  std::vector<int> roi2image(rois_num);
+  const auto &lod = input_rois->lod().back();
+  for (size_t i = 0; i < lod.size() - 1; ++i) {
+    for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
+      roi2image[j] = i;
+    }
+  }
+  for (int n = 0; n < rois_num; ++n) {
+    const float *n_rois = rois_data + n * 8;
+    float roi_x[4];
+    float roi_y[4];
+    for (int k = 0; k < 4; ++k) {
+      roi_x[k] = n_rois[2 * k] * spatial_scale;
+      roi_y[k] = n_rois[2 * k + 1] * spatial_scale;
+    }
+    int image_id = roi2image[n];
+    // Get transform matrix
+    float transform_matrix[9];
+    get_transform_matrix<float>(transformed_width, transformed_height, roi_x,
+                                roi_y, transform_matrix);
+    for (int c = 0; c < channels; ++c) {
+      for (int out_h = 0; out_h < transformed_height; ++out_h) {
+        for (int out_w = 0; out_w < transformed_width; ++out_w) {
+          int out_index =
+              n * channels * transformed_height * transformed_width +
+              c * transformed_height * transformed_width +
+              out_h * transformed_width + out_w;
+          float in_w, in_h;
+          get_source_coords<float>(transform_matrix, out_w, out_h, &in_w,
+                                   &in_h);
+          if (in_quad<float>(in_w, in_h, roi_x, roi_y)) {
+            if ((-0.5 > in_w) || (in_w > (in_width - 0.5)) || (-0.5 > in_h) ||
+                (in_h > (in_height - 0.5))) {
+              output_data[out_index] = 0.0;
+            } else {
+              bilinear_interpolate<float>(input_data, channels, in_width,
+                                          in_height, image_id, c, in_w, in_h,
+                                          output_data + out_index);
+            }
+          } else {
+            output_data[out_index] = 0.0;
+          }
+        }
+      }
+    }
+  }
+}
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif  // ROI_PERSPECTIVE_OP
--- a/src/operators/kernel/detection_kernel.h
+++ b/src/operators/kernel/detection_kernel.h
@@ -136,5 +136,35 @@ class PSRoiPoolParam : public OpParam {
 DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam);
 #endif
+#ifdef ROI_PERSPECTIVE_OP
+template <typename Dtype>
+class RoiPerspectiveParam : public OpParam {
+ public:
+  RoiPerspectiveParam(const VariableNameMap &inputs,
+                      const VariableNameMap &outputs, const AttributeMap &attrs,
+                      const Scope &scope) {
+    input_x_ = OpParam::GetVarValue<framework::Tensor>("X", inputs, scope);
+    input_rois_ =
+        OpParam::GetVarValue<framework::LoDTensor>("ROIs", inputs, scope);
+    output_ = OpParam::GetVarValue<framework::Tensor>("Out", outputs, scope);
+    spatial_scale_ = OpParam::GetAttr<float>("spatial_scale", attrs);
+    transformed_height_ = OpParam::GetAttr<int>("transformed_height", attrs);
+    transformed_width_ = OpParam::GetAttr<int>("transformed_width", attrs);
+  }
+ public:
+  framework::Tensor *input_x_;
+  framework::LoDTensor *input_rois_;
+  framework::Tensor *output_;
+  float spatial_scale_;
+  int transformed_height_;
+  int transformed_width_;
+};
+DECLARE_KERNEL(RoiPerspective, RoiPerspectiveParam);
+#endif
 }  // namespace operators
 }  // namespace paddle_mobile
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -291,6 +291,7 @@ if(NOT FOUND_MATCH)
  set(ANCHOR_GENERATOR_OP ON)
  set(PROPOSAL_OP ON)
  set(PSROI_POOL_OP ON)
+  set(ROI_PERSPECTIVE_OP ON)
 endif()
  # option(BATCHNORM_OP "" ON)
@@ -585,3 +586,6 @@ endif()
 if (PSROI_POOL_OP)
  add_definitions(-DPSROI_POOL_OP)
 endif()
+if (ROI_PERSPECTIVE_OP)
+  add_definitions(-DROI_PERSPECTIVE_OP)
+endif()