add roi align x86 kernel

975cd45d · shipengchao · MaxwellDing · d6791276 · 975cd45d · 975cd45d
7 changed file
--- a/lite/core/mir/fusion/fc_fuse_pass.cc
+++ b/lite/core/mir/fusion/fc_fuse_pass.cc
@@ -27,7 +27,7 @@ void FcFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
 #ifdef LITE_WITH_MLU
  fusion::FcFuser fuser(false);
  fuser(graph.get());
-#elif
+#else
  fusion::FcFuser fuser(true);
  fuser(graph.get());
 #endif

--- a/lite/core/mir/mlu_postprocess_pass.cc
+++ b/lite/core/mir/mlu_postprocess_pass.cc
@@ -854,9 +854,11 @@ void ModifyValidPlaces(SSAGraph* graph, bool use_mlu_cast) {
    for (auto& place : v_places) {
      prec_set.insert(place.precision);
    }
+#ifdef LITE_WITH_MLU
    if (lite::TargetWrapperMlu::UseFirstConv()) {
      prec_set.insert(PRECISION(kInt8));
    }
+#endif
    for (auto& prec : prec_set) {
      v_places.emplace_back(TARGET(kX86), prec, DATALAYOUT(kNHWC));
    }

--- a/lite/kernels/x86/CMakeLists.txt
+++ b/lite/kernels/x86/CMakeLists.txt
@@ -70,6 +70,7 @@ add_kernel(search_fc_compute_x86 X86 basic SRCS search_fc_compute.cc DEPS ${lite

 add_kernel(matmul_compute_x86 X86 basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps} blas)
 add_kernel(yolo_box_compute_x86 X86 basic SRCS yolo_box_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(roi_align_compute_x86 X86 basic SRCS roi_align_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(interpolate_compute_x86 X86 basic SRCS interpolate_compute.cc DEPS ${lite_kernel_deps})

 lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
@@ -111,5 +112,6 @@ lite_cc_test(test_sequence_arithmetic_compute_x86 SRCS sequence_arithmetic_compu
 lite_cc_test(test_leaky_relu_compute_x86 SRCS leaky_relu_compute_test.cc DEPS activation_compute_x86)
 lite_cc_test(test_yolo_box_compute_x86 SRCS yolo_box_compute_test.cc DEPS
  yolo_box_compute_x86)
+# lite_cc_test(test_roi_align_compute_x86 SRCS roi_align_compute_test.cc DEPS roi_align_compute_x86)
 lite_cc_test(test_nearest_interp_comute_x86 SRCS interpolate_compute_test.cc
  DEPS interpolate_compute_x86)
--- a/lite/kernels/x86/roi_align_compute.cc
+++ b/lite/kernels/x86/roi_align_compute.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/x86/roi_align_compute.h"
+#include <algorithm>
+#include <cmath>
+#include <string>
+#include <vector>
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+#include "lite/core/type_system.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+// Number of entries stored per bilinear sample point: the pre-calculation
+// routine below writes four neighbouring-pixel offsets and four matching
+// weights for every sample, and the kernel reads them back in groups of four.
+static constexpr int kROISize = 4;
+
+// Fills `pre_pos` / `pre_w` with the bilinear-interpolation lookup table of a
+// single ROI.
+//
+// Sample points are visited in (ph, pw, iy, ix) order. For each one, kROISize
+// consecutive entries are written to both buffers: the flattened offsets
+// (row * width + column) of the four surrounding pixels and their weights, in
+// the order (low,low), (low,high), (high,low), (high,high) for (y,x).
+// Samples lying more than one pixel outside the map get all-zero entries.
+//
+//   height, width                 extent used for clamping and row offsets
+//   pooled_height, pooled_width   number of output bins per axis
+//   iy_upper, ix_upper            number of samples per bin along y / x
+//   roi_ymin, roi_xmin            top-left corner of the ROI
+//   bin_size_h, bin_size_w        size of one bin
+//   roi_bin_grid_h/_w             divisors used to space samples inside a bin
+//   pre_pos, pre_w                output buffers; must already be resized to
+//                                 hold kROISize entries per sample point
+template <class T>
+void PreCalcForBilinearInterpolate(const int height,
+                                   const int width,
+                                   const int pooled_height,
+                                   const int pooled_width,
+                                   const int iy_upper,
+                                   const int ix_upper,
+                                   T roi_ymin,
+                                   T roi_xmin,
+                                   T bin_size_h,
+                                   T bin_size_w,
+                                   int roi_bin_grid_h,
+                                   int roi_bin_grid_w,
+                                   Tensor* pre_pos,
+                                   Tensor* pre_w) {
+  int* pos_buf = pre_pos->mutable_data<int>();
+  T* weight_buf = pre_w->mutable_data<T>();
+
+  // Running index of the current sample point; it is advanced by the
+  // innermost loop header so that skipped (out-of-map) samples count too.
+  int sample = 0;
+  for (int ph = 0; ph < pooled_height; ++ph) {
+    for (int pw = 0; pw < pooled_width; ++pw) {
+      for (int iy = 0; iy < iy_upper; ++iy) {
+        // y coordinate shared by the whole row of samples. It is clamped in
+        // place further down, so the clamped value carries over to the
+        // remaining ix iterations of this row.
+        T y = roi_ymin + ph * bin_size_h +
+              static_cast<T>(iy + .5f) * bin_size_h /
+                  static_cast<T>(roi_bin_grid_h);
+        for (int ix = 0; ix < ix_upper; ++ix, ++sample) {
+          int* pos = pos_buf + sample * kROISize;
+          T* weight = weight_buf + sample * kROISize;
+
+          // x coordinate of this sample point
+          T x = roi_xmin + pw * bin_size_w +
+                static_cast<T>(ix + .5f) * bin_size_w /
+                    static_cast<T>(roi_bin_grid_w);
+
+          // Samples outside the map contribute nothing.
+          const bool outside =
+              y < -1.0 || y > height || x < -1.0 || x > width;
+          if (outside) {
+            for (int k = 0; k < kROISize; ++k) {
+              pos[k] = 0;
+              weight[k] = 0;
+            }
+            continue;
+          }
+
+          // Pull slightly negative coordinates back onto the border.
+          if (y <= 0) {
+            y = 0;
+          }
+          if (x <= 0) {
+            x = 0;
+          }
+
+          int y_low = static_cast<int>(y);
+          int x_low = static_cast<int>(x);
+          int y_high;
+          int x_high;
+          // On the last row / column both neighbours collapse onto the border
+          // pixel and the coordinate is snapped to it.
+          if (y_low < height - 1) {
+            y_high = y_low + 1;
+          } else {
+            y_high = y_low = height - 1;
+            y = static_cast<T>(y_low);
+          }
+          if (x_low < width - 1) {
+            x_high = x_low + 1;
+          } else {
+            x_high = x_low = width - 1;
+            x = static_cast<T>(x_low);
+          }
+
+          // Fractional distances to the low neighbours and their complements.
+          const T ly = y - y_low;
+          const T lx = x - x_low;
+          const T hy = 1. - ly;
+          const T hx = 1. - lx;
+
+          pos[0] = y_low * width + x_low;
+          pos[1] = y_low * width + x_high;
+          pos[2] = y_high * width + x_low;
+          pos[3] = y_high * width + x_high;
+
+          weight[0] = hy * hx;
+          weight[1] = hy * lx;
+          weight[2] = ly * hx;
+          weight[3] = ly * lx;
+        }
+      }
+    }
+  }
+}
+
+// Runs ROI Align on the float input `X` (indexed as N, C, H, W) for every row
+// of `ROIs` and writes one pooled_height x pooled_width map per channel and
+// ROI into `Out`.
+//
+// Each ROI row starts with four floats (xmin, ymin, xmax, ymax) that are
+// multiplied by spatial_scale. Every output bin is the average of
+// roi_bin_grid_h * roi_bin_grid_w bilinear samples, where the grid size is
+// sampling_ratio when it is positive and ceil(roi_size / pooled_size)
+// otherwise.
+//
+// The image a ROI belongs to is derived from the last LoD level of `ROIs`.
+// ROIs that the LoD does not cover (including an empty LoD) are assigned to
+// image 0, and LoD offsets beyond the number of ROIs are ignored, so that no
+// uninitialized or out-of-range batch index is ever used.
+void RoiAlignCompute::Run() {
+  auto& param = Param<operators::RoiAlignParam>();
+  auto* in = param.X;
+  auto* rois = param.ROIs;
+  auto* out = param.Out;
+  const float spatial_scale = param.spatial_scale;
+  const int pooled_height = param.pooled_height;
+  const int pooled_width = param.pooled_width;
+  const int sampling_ratio = param.sampling_ratio;
+
+  auto in_dims = in->dims();
+  auto rois_dims = rois->dims();
+  auto out_dims = out->dims();
+  const int channels = in_dims[1];
+  const int height = in_dims[2];
+  const int width = in_dims[3];
+  const int rois_num = rois_dims[0];
+  if (rois_num <= 0) {
+    return;
+  }
+
+  // Element strides, kept in the dims' own integer type.
+  const auto in_batch_stride = in_dims[1] * in_dims[2] * in_dims[3];
+  const auto in_channel_stride = in_dims[2] * in_dims[3];
+  const auto roi_row_stride = rois_dims[1];
+  const auto out_channel_stride = out_dims[2] * out_dims[3];
+
+  auto* input_data = in->data<float>();
+
+  // Batch (image) index of every ROI, defaulting to 0.
+  std::vector<int> roi_batch_ids(rois_num, 0);
+  const auto& lod = rois->lod();
+  if (!lod.empty()) {
+    const auto& rois_lod = lod.back();
+    const size_t roi_total = static_cast<size_t>(rois_num);
+    for (size_t n = 0; n + 1 < rois_lod.size(); ++n) {
+      // Clamp the segment end so a malformed LoD cannot write past the list.
+      const size_t end = std::min<size_t>(rois_lod[n + 1], roi_total);
+      for (size_t i = rois_lod[n]; i < end; ++i) {
+        roi_batch_ids[i] = static_cast<int>(n);
+      }
+    }
+  }
+
+  auto* output_data = out->mutable_data<float>();
+  auto* rois_data = rois->data<float>();
+  for (int n = 0; n < rois_num; ++n) {
+    const int roi_batch_id = roi_batch_ids[n];
+    const float roi_xmin = rois_data[0] * spatial_scale;
+    const float roi_ymin = rois_data[1] * spatial_scale;
+    const float roi_xmax = rois_data[2] * spatial_scale;
+    const float roi_ymax = rois_data[3] * spatial_scale;
+
+    // ROIs are forced to be at least 1x1 so bins never have zero size.
+    const float roi_width = std::max(roi_xmax - roi_xmin, 1.0f);
+    const float roi_height = std::max(roi_ymax - roi_ymin, 1.0f);
+    const float bin_size_h = roi_height / pooled_height;
+    const float bin_size_w = roi_width / pooled_width;
+    const float* batch_data = input_data + roi_batch_id * in_batch_stride;
+
+    const int roi_bin_grid_h =
+        (sampling_ratio > 0)
+            ? sampling_ratio
+            : static_cast<int>(std::ceil(roi_height / pooled_height));
+    const int roi_bin_grid_w =
+        (sampling_ratio > 0)
+            ? sampling_ratio
+            : static_cast<int>(std::ceil(roi_width / pooled_width));
+    const int grid_count = roi_bin_grid_h * roi_bin_grid_w;
+    const float count = grid_count;
+
+    // One table row per sample point. The size follows exactly the loop
+    // bounds used by PreCalcForBilinearInterpolate, so the table can never be
+    // smaller than what that routine writes.
+    const int pre_size = grid_count * pooled_height * pooled_width;
+    Tensor pre_pos;
+    Tensor pre_w;
+    pre_pos.Resize({pre_size, kROISize});
+    pre_w.Resize({pre_size, kROISize});
+    PreCalcForBilinearInterpolate<float>(height,
+                                         width,
+                                         pooled_height,
+                                         pooled_width,
+                                         roi_bin_grid_h,
+                                         roi_bin_grid_w,
+                                         roi_ymin,
+                                         roi_xmin,
+                                         bin_size_h,
+                                         bin_size_w,
+                                         roi_bin_grid_h,
+                                         roi_bin_grid_w,
+                                         &pre_pos,
+                                         &pre_w);
+
+    const int* pre_pos_data = pre_pos.data<int>();
+    const float* pre_w_data = pre_w.data<float>();
+    for (int c = 0; c < channels; c++) {
+      // The lookup table is identical for every channel; restart at its top.
+      int pre_calc_index = 0;
+      for (int ph = 0; ph < pooled_height; ph++) {
+        for (int pw = 0; pw < pooled_width; pw++) {
+          const int pool_index = ph * pooled_width + pw;
+          float output_val = 0;
+          for (int iy = 0; iy < roi_bin_grid_h; iy++) {
+            for (int ix = 0; ix < roi_bin_grid_w; ix++) {
+              for (int i = 0; i < kROISize; i++) {
+                const int pos = pre_pos_data[pre_calc_index * kROISize + i];
+                const float w = pre_w_data[pre_calc_index * kROISize + i];
+                output_val += w * batch_data[pos];
+              }
+              pre_calc_index += 1;
+            }
+          }
+          // Average over all samples of the bin.
+          output_val /= count;
+          output_data[pool_index] = output_val;
+        }
+      }
+      batch_data += in_channel_stride;
+      output_data += out_channel_stride;
+    }
+    rois_data += roi_row_stride;
+  }
+}
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+// Registers RoiAlignCompute as the `roi_align` kernel for target kX86,
+// precision kFloat and layout kNCHW under the tag `def`. The inputs "X" and
+// "ROIs" and the output "Out" are all bound as kX86 tensors.
+REGISTER_LITE_KERNEL(roi_align,
+                     kX86,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::x86::RoiAlignCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindInput("ROIs", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
--- a/lite/kernels/x86/roi_align_compute.h
+++ b/lite/kernels/x86/roi_align_compute.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include "lite/core/kernel.h"
+#include "lite/operators/roi_align_op.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+// x86 float kernel for the roi_align operator. The class only declares the
+// kernel interface; Run() is defined in roi_align_compute.cc.
+class RoiAlignCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  // Parameter struct this kernel reads its inputs, outputs and attributes from.
+  using param_t = operators::RoiAlignParam;
+
+  // Executes the kernel on the currently bound parameters.
+  void Run() override;
+
+  virtual ~RoiAlignCompute() = default;
+};
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
--- a/lite/tests/kernels/CMakeLists.txt
+++ b/lite/tests/kernels/CMakeLists.txt
@@ -55,7 +55,7 @@ if(LITE_BUILD_EXTRA)
    lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    #lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})

--- a/lite/tests/kernels/roi_align_compute_test.cc
+++ b/lite/tests/kernels/roi_align_compute_test.cc
@@ -120,6 +120,13 @@ TEST(RoiAlign, precision) {
  // The unit test for roi_align needs the params,
  // which is obtained by runing model by paddle.
  LOG(INFO) << "test roi align op";
+#ifdef LITE_WITH_X86
+  Place place(TARGET(kX86));
+  std::unique_ptr<arena::TestCase> tester(
+      new RoiAlignComputeTester(place, "def"));
+  arena::Arena arena(std::move(tester), place, 2e-4);
+  arena.TestPrecision();
+#endif
 #ifdef LITE_WITH_ARM
  Place place(TARGET(kARM));
  std::unique_ptr<arena::TestCase> tester(