From 292f52ccb17428b92ed61d74e5588648b7ff196d Mon Sep 17 00:00:00 2001
From: yongqiangma <xing.wo@163.com>
Date: Wed, 19 Aug 2020 12:36:49 +0800
Subject: [PATCH] change box_coder. test=develop (#4107)

---
 lite/backends/arm/math/box_coder.cc   | 267 ++++++++++++++++++++------
 lite/backends/arm/math/box_coder.h    |  15 +-
 lite/kernels/arm/box_coder_compute.cc | 193 ++-----------------
 3 files changed, 228 insertions(+), 247 deletions(-)
diff --git a/lite/backends/arm/math/box_coder.cc b/lite/backends/arm/math/box_coder.cc
index 7cb904a8ee..ea9187cec8 100644
--- a/lite/backends/arm/math/box_coder.cc
+++ b/lite/backends/arm/math/box_coder.cc
@@ -20,69 +20,214 @@ namespace lite {
 namespace arm {
 namespace math {
 
-void box_coder(lite::Tensor* proposals,
-               const lite::Tensor* anchors,
-               const lite::Tensor* variances,
-               const lite::Tensor* bbox_deltas,
-               const std::string code_type,
-               bool box_normalized,
-               int axis) {
-  if (code_type == "decode_center_size") {
-    float normalized = !box_normalized ? 1.f : 0;
-
-    const float* anchor_data = anchors->data<float>();
-    const float* bbox_deltas_data = bbox_deltas->data<float>();
-    float* proposals_data = proposals->mutable_data<float>();
-    const float* variances_data = variances->data<float>();
-
-    int N = bbox_deltas->dims()[0];
-    int M = bbox_deltas->dims()[1];
-    int len = bbox_deltas->dims()[2];
-
-    for (int64_t row_id = 0; row_id < N; ++row_id) {
-      for (int64_t col_id = 0; col_id < M; ++col_id) {
-        size_t offset = row_id * M * len + col_id * len;
-        int prior_box_offset = axis == 0 ? col_id * len : row_id * len;
-        int var_offset = axis == 0 ? col_id * len : row_id * len;
-
-        auto anchor_data_tmp = anchor_data + prior_box_offset;
-        auto bbox_deltas_data_tmp = bbox_deltas_data + offset;
-        auto proposals_data_tmp = proposals_data + offset;
-
-        auto anchor_width =
-            anchor_data_tmp[2] - anchor_data_tmp[0] + normalized;
-        auto anchor_height =
-            anchor_data_tmp[3] - anchor_data_tmp[1] + normalized;
-        auto anchor_center_x = anchor_data_tmp[0] + 0.5 * anchor_width;
-        auto anchor_center_y = anchor_data_tmp[1] + 0.5 * anchor_height;
-
-        float bbox_center_x = 0, bbox_center_y = 0;
-        float bbox_width = 0, bbox_height = 0;
-
-        auto variances_data_tmp = variances_data + var_offset;
-
-        bbox_center_x =
-            variances_data_tmp[0] * bbox_deltas_data_tmp[0] * anchor_width +
-            anchor_center_x;
-        bbox_center_y =
-            variances_data_tmp[1] * bbox_deltas_data_tmp[1] * anchor_height +
-            anchor_center_y;
-        bbox_width = std::exp(variances_data_tmp[2] * bbox_deltas_data_tmp[2]) *
-                     anchor_width;
-        bbox_height =
-            std::exp(variances_data_tmp[3] * bbox_deltas_data_tmp[3]) *
-            anchor_height;
-
-        proposals_data_tmp[0] = bbox_center_x - bbox_width / 2;
-        proposals_data_tmp[1] = bbox_center_y - bbox_height / 2;
-        proposals_data_tmp[2] = bbox_center_x + bbox_width / 2 - normalized;
-        proposals_data_tmp[3] = bbox_center_y + bbox_height / 2 - normalized;
-      }
+void decode_bbox_center_variance_kernel(const int batch_num,
+                                        const float* loc_data,
+                                        const float* prior_data,
+                                        const float* variance,
+                                        const int num_priors,
+                                        float* bbox_data) {
+  int cnt = num_priors / 4;
+  //! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax
+  //! vloc   0: xmin, 1: ymin, 2: xmax, 3: ymax
+  //! vvar
+  float32x4_t vhalf = vdupq_n_f32(0.5f);
+
+  int len_batch = num_priors * 4;
+
+  for (int n = 0; n < batch_num; ++n) {
+    const float* ptr_loc_batch = loc_data + n * len_batch;
+    float* ptr_bbox_batch = bbox_data + n * len_batch;
+
+#pragma omp parallel for
+    for (int i = 0; i < cnt; ++i) {
+      int idx = i * 16;
+      const float* ptr_loc = ptr_loc_batch + idx;
+      const float* ptr_prior = prior_data + idx;
+      float* ptr_bbox = ptr_bbox_batch + idx;
+
+      float32x4x4_t vprior = vld4q_f32(ptr_prior);
+      float32x4x4_t vloc = vld4q_f32(ptr_loc);
+      float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]);
+      float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]);
+      float32x4_t vprior_cx =
+          vmulq_f32(vaddq_f32(vprior.val[0], vprior.val[2]), vhalf);
+      float32x4_t vprior_cy =
+          vmulq_f32(vaddq_f32(vprior.val[1], vprior.val[3]), vhalf);
+
+      float32x4_t vdec_bbx_cx =
+          vaddq_f32(vmulq_f32(vloc.val[0], vprior_width), vprior_cx);
+      float32x4_t vdec_bbx_cy =
+          vaddq_f32(vmulq_f32(vloc.val[1], vprior_height), vprior_cy);
+      float32x4_t vdec_bbx_w = exp_ps(vloc.val[2]);
+      float32x4_t vdec_bbx_h = exp_ps(vloc.val[3]);
+      vprior_width = vmulq_f32(vprior_width, vhalf);
+      vprior_height = vmulq_f32(vprior_height, vhalf);
+      vdec_bbx_w = vmulq_f32(vdec_bbx_w, vprior_width);
+      vdec_bbx_h = vmulq_f32(vdec_bbx_h, vprior_height);
+
+      vloc.val[0] = vsubq_f32(vdec_bbx_cx, vdec_bbx_w);
+      vloc.val[1] = vsubq_f32(vdec_bbx_cy, vdec_bbx_h);
+      vloc.val[2] = vaddq_f32(vdec_bbx_cx, vdec_bbx_w);
+      vloc.val[3] = vaddq_f32(vdec_bbx_cy, vdec_bbx_h);
+
+      vst4q_f32(ptr_bbox, vloc);
     }
-  } else if (code_type == "encode_center_size") {
-    LOG(FATAL) << "not implemented type: " << code_type;
+#pragma omp parallel for
+    for (int i = cnt * 4; i < num_priors; i++) {
+      int idx = i * 4;
+      float p_xmin = prior_data[idx];
+      float p_ymin = prior_data[idx + 1];
+      float p_xmax = prior_data[idx + 2];
+      float p_ymax = prior_data[idx + 3];
+      float prior_width = p_xmax - p_xmin;
+      float prior_height = p_ymax - p_ymin;
+      float prior_center_x = (p_xmin + p_xmax) / 2.f;
+      float prior_center_y = (p_ymin + p_ymax) / 2.f;
+
+      float xmin = ptr_loc_batch[idx];
+      float ymin = ptr_loc_batch[idx + 1];
+      float xmax = ptr_loc_batch[idx + 2];
+      float ymax = ptr_loc_batch[idx + 3];
+
+      //! variance is encoded in target, we simply need to retore the offset
+      //! predictions.
+      float decode_bbox_center_x = xmin * prior_width + prior_center_x;
+      float decode_bbox_center_y = ymin * prior_height + prior_center_y;
+      float decode_bbox_width = expf(xmax) * prior_width;
+      float decode_bbox_height = expf(ymax) * prior_height;
+
+      ptr_bbox_batch[idx] = decode_bbox_center_x - decode_bbox_width / 2.f;
+      ptr_bbox_batch[idx + 1] = decode_bbox_center_y - decode_bbox_height / 2.f;
+      ptr_bbox_batch[idx + 2] = decode_bbox_center_x + decode_bbox_width / 2.f;
+      ptr_bbox_batch[idx + 3] = decode_bbox_center_y + decode_bbox_height / 2.f;
+    }
+  }
+}
+
+void decode_bbox_center_no_variance_kernel(const int batch_num,
+                                           const float* loc_data,
+                                           const float* prior_data,
+                                           const float* variance,
+                                           const int num_priors,
+                                           const bool normalized,
+                                           float* bbox_data) {
+  int cnt = num_priors / 4;
+  //! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax
+  //! vloc   0: xmin, 1: ymin, 2: xmax, 3: ymax
+  //! vvar
+  float32x4_t vhalf = vdupq_n_f32(0.5f);
+  float norm_value = (normalized == false);
+  float32x4_t vnormalized = vdupq_n_f32(norm_value);
+  int len_batch = num_priors * 4;
+
+  for (int n = 0; n < batch_num; ++n) {
+    const float* ptr_loc_batch = loc_data + n * len_batch;
+    float* ptr_bbox_batch = bbox_data + n * len_batch;
+
+#pragma omp parallel for
+    for (int i = 0; i < cnt; ++i) {
+      int idx = i * 16;
+
+      const float* ptr_loc = ptr_loc_batch + idx;
+      const float* ptr_prior = prior_data + idx;
+      const float* ptr_var = variance + idx;
+      float* ptr_bbox = ptr_bbox_batch + idx;
+
+      float32x4x4_t vprior = vld4q_f32(ptr_prior);
+      float32x4x4_t vloc = vld4q_f32(ptr_loc);
+      float32x4x4_t vvar = vld4q_f32(ptr_var);
+      float32x4_t vprior_width1 = vsubq_f32(vprior.val[2], vprior.val[0]);
+      float32x4_t vprior_height1 = vsubq_f32(vprior.val[3], vprior.val[1]);
+      float32x4_t vprior_width = vaddq_f32(vprior_width1, vnormalized);
+      float32x4_t vprior_height = vaddq_f32(vprior_height1, vnormalized);
+
+      float32x4_t vprior_cx =
+          vaddq_f32(vprior.val[0], vmulq_f32(vprior_width, vhalf));
+      float32x4_t vprior_cy =
+          vaddq_f32(vprior.val[1], vmulq_f32(vprior_height, vhalf));
+
+      vloc.val[0] = vmulq_f32(vloc.val[0], vvar.val[0]);
+      vloc.val[1] = vmulq_f32(vloc.val[1], vvar.val[1]);
+      vloc.val[2] = vmulq_f32(vloc.val[2], vvar.val[2]);
+      vloc.val[3] = vmulq_f32(vloc.val[3], vvar.val[3]);
+
+      float32x4_t vdec_bbx_cx =
+          vaddq_f32(vmulq_f32(vloc.val[0], vprior_width), vprior_cx);
+      float32x4_t vdec_bbx_cy =
+          vaddq_f32(vmulq_f32(vloc.val[1], vprior_height), vprior_cy);
+      float32x4_t vdec_bbx_w = exp_ps(vloc.val[2]);
+      float32x4_t vdec_bbx_h = exp_ps(vloc.val[3]);
+      vprior_width = vmulq_f32(vprior_width, vhalf);
+      vprior_height = vmulq_f32(vprior_height, vhalf);
+      vdec_bbx_w = vmulq_f32(vdec_bbx_w, vprior_width);
+      vdec_bbx_h = vmulq_f32(vdec_bbx_h, vprior_height);
+
+      vloc.val[0] = vsubq_f32(vdec_bbx_cx, vdec_bbx_w);
+      vloc.val[1] = vsubq_f32(vdec_bbx_cy, vdec_bbx_h);
+      vloc.val[2] = vaddq_f32(vdec_bbx_cx, vsubq_f32(vdec_bbx_w, vnormalized));
+      vloc.val[3] = vaddq_f32(vdec_bbx_cy, vsubq_f32(vdec_bbx_h, vnormalized));
+
+      vst4q_f32(ptr_bbox, vloc);
+    }
+
+#pragma omp parallel for
+    for (int i = cnt * 4; i < num_priors; i++) {
+      int idx = i * 4;
+      float p_xmin = prior_data[idx];
+      float p_ymin = prior_data[idx + 1];
+      float p_xmax = prior_data[idx + 2];
+      float p_ymax = prior_data[idx + 3];
+      float prior_width = p_xmax - p_xmin + norm_value;
+      float prior_height = p_ymax - p_ymin + norm_value;
+      float prior_center_x = p_xmin + prior_width / 2.f;
+      float prior_center_y = p_ymin + prior_height / 2.f;
+
+      float xmin = ptr_loc_batch[idx];
+      float ymin = ptr_loc_batch[idx + 1];
+      float xmax = ptr_loc_batch[idx + 2];
+      float ymax = ptr_loc_batch[idx + 3];
+
+      //! variance is encoded in target, we simply need to retore the offset
+      //! predictions.
+      float decode_bbox_center_x =
+          variance[idx] * xmin * prior_width + prior_center_x;
+      float decode_bbox_center_y =
+          variance[idx + 1] * ymin * prior_height + prior_center_y;
+      float decode_bbox_width = expf(variance[idx + 2] * xmax) * prior_width;
+      float decode_bbox_height = expf(variance[idx + 3] * ymax) * prior_height;
+
+      ptr_bbox_batch[idx] = decode_bbox_center_x - decode_bbox_width / 2.f;
+      ptr_bbox_batch[idx + 1] = decode_bbox_center_y - decode_bbox_height / 2.f;
+      ptr_bbox_batch[idx + 2] =
+          decode_bbox_center_x + decode_bbox_width / 2.f - norm_value;
+      ptr_bbox_batch[idx + 3] =
+          decode_bbox_center_y + decode_bbox_height / 2.f - norm_value;
+    }
+  }
+}
+
+void decode_bboxes(const int batch_num,
+                   const float* loc_data,
+                   const float* prior_data,
+                   const float* variance_data,
+                   const std::string code_type,
+                   const bool normalized,
+                   const int num_priors,
+                   float* bbox_data) {
+  if (code_type == "encode_center_size") {
+    decode_bbox_center_variance_kernel(
+        batch_num, loc_data, prior_data, variance_data, num_priors, bbox_data);
+
+  } else if (code_type == "decode_center_size") {
+    decode_bbox_center_no_variance_kernel(batch_num,
+                                          loc_data,
+                                          prior_data,
+                                          variance_data,
+                                          num_priors,
+                                          normalized,
+                                          bbox_data);
   } else {
-    LOG(FATAL) << "not supported type: " << code_type;
+    LOG(FATAL) << "box_coder don't support this code_type: " << code_type;
   }
 }
 
diff --git a/lite/backends/arm/math/box_coder.h b/lite/backends/arm/math/box_coder.h
index bbeb3e0618..29281dc5c9 100644
--- a/lite/backends/arm/math/box_coder.h
+++ b/lite/backends/arm/math/box_coder.h
@@ -22,13 +22,14 @@ namespace lite {
 namespace arm {
 namespace math {
 
-void box_coder(lite::Tensor* proposals,
-               const lite::Tensor* anchors,
-               const lite::Tensor* variances,
-               const lite::Tensor* bbox_deltas,
-               const std::string code_type,
-               bool box_normalized,
-               int axis);
+void decode_bboxes(const int batch_num,
+                   const float* loc_data,
+                   const float* prior_data,
+                   const float* variance_data,
+                   const std::string code_type,
+                   const bool normalized,
+                   const int num_priors,
+                   float* bbox_data);
 
 }  // namespace math
 }  // namespace arm
diff --git a/lite/kernels/arm/box_coder_compute.cc b/lite/kernels/arm/box_coder_compute.cc
index 81e79a83f2..8b0c38c97c 100644
--- a/lite/kernels/arm/box_coder_compute.cc
+++ b/lite/kernels/arm/box_coder_compute.cc
@@ -22,156 +22,7 @@ namespace lite {
 namespace kernels {
 namespace arm {
 
-void EncodeCenterSize(const Tensor* target_box,
-                      const Tensor* prior_box,
-                      const Tensor* prior_box_var,
-                      const bool normalized,
-                      const std::vector<float> variance,
-                      float* output) {
-  int64_t row = target_box->dims()[0];
-  int64_t col = prior_box->dims()[0];
-  int64_t len = prior_box->dims()[1];
-  for (int64_t i = 0; i < row; ++i) {
-    for (int64_t j = 0; j < col; ++j) {
-      auto* target_box_data = target_box->data<float>();
-      auto* prior_box_data = prior_box->data<float>();
-      int64_t offset = i * col * len + j * len;
-      float prior_box_width = prior_box_data[j * len + 2] -
-                              prior_box_data[j * len] + (normalized == false);
-      float prior_box_height = prior_box_data[j * len + 3] -
-                               prior_box_data[j * len + 1] +
-                               (normalized == false);
-      float prior_box_center_x = prior_box_data[j * len] + prior_box_width / 2;
-      float prior_box_center_y =
-          prior_box_data[j * len + 1] + prior_box_height / 2;
-
-      float target_box_center_x =
-          (target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
-      float target_box_center_y =
-          (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
-      float target_box_width = target_box_data[i * len + 2] -
-                               target_box_data[i * len] + (normalized == false);
-      float target_box_height = target_box_data[i * len + 3] -
-                                target_box_data[i * len + 1] +
-                                (normalized == false);
-
-      output[offset] =
-          (target_box_center_x - prior_box_center_x) / prior_box_width;
-      output[offset + 1] =
-          (target_box_center_y - prior_box_center_y) / prior_box_height;
-      output[offset + 2] =
-          std::log(std::fabs(target_box_width / prior_box_width));
-      output[offset + 3] =
-          std::log(std::fabs(target_box_height / prior_box_height));
-    }
-  }
-
-  if (prior_box_var) {
-    const float* prior_box_var_data = prior_box_var->data<float>();
-    for (int64_t i = 0; i < row; ++i) {
-      for (int64_t j = 0; j < col; ++j) {
-        for (int k = 0; k < 4; ++k) {
-          int64_t offset = i * col * len + j * len;
-          int64_t prior_var_offset = j * len;
-          output[offset + k] /= prior_box_var_data[prior_var_offset + k];
-        }
-      }
-    }
-  } else if (!(variance.empty())) {
-    for (int64_t i = 0; i < row; ++i) {
-      for (int64_t j = 0; j < col; ++j) {
-        for (int k = 0; k < 4; ++k) {
-          int64_t offset = i * col * len + j * len;
-          output[offset + k] /= static_cast<float>(variance[k]);
-        }
-      }
-    }
-  }
-}
-
-template <int axis, int var_size>
-void DecodeCenterSize(const Tensor* target_box,
-                      const Tensor* prior_box,
-                      const Tensor* prior_box_var,
-                      const bool normalized,
-                      std::vector<float> variance,
-                      float* output) {
-  int64_t row = target_box->dims()[0];
-  int64_t col = target_box->dims()[1];
-  int64_t len = target_box->dims()[2];
-
-  for (int64_t i = 0; i < row; ++i) {
-    for (int64_t j = 0; j < col; ++j) {
-      auto* target_box_data = target_box->data<float>();
-      auto* prior_box_data = prior_box->data<float>();
-
-      float var_data[4] = {1., 1., 1., 1.};
-      float* var_ptr = var_data;
-      int64_t offset = i * col * len + j * len;
-      int64_t prior_box_offset = axis == 0 ? j * len : i * len;
-
-      float prior_box_width = prior_box_data[prior_box_offset + 2] -
-                              prior_box_data[prior_box_offset] +
-                              (normalized == false);
-      float prior_box_height = prior_box_data[prior_box_offset + 3] -
-                               prior_box_data[prior_box_offset + 1] +
-                               (normalized == false);
-      float prior_box_center_x =
-          prior_box_data[prior_box_offset] + prior_box_width / 2;
-      float prior_box_center_y =
-          prior_box_data[prior_box_offset + 1] + prior_box_height / 2;
-
-      float target_box_center_x = 0, target_box_center_y = 0;
-      float target_box_width = 0, target_box_height = 0;
-      int64_t prior_var_offset = axis == 0 ? j * len : i * len;
-      if (var_size == 2) {
-        std::memcpy(var_ptr,
-                    prior_box_var->data<float>() + prior_var_offset,
-                    4 * sizeof(float));
-      } else if (var_size == 1) {
-        var_ptr = reinterpret_cast<float*>(variance.data());
-      }
-      float box_var_x = *var_ptr;
-      float box_var_y = *(var_ptr + 1);
-      float box_var_w = *(var_ptr + 2);
-      float box_var_h = *(var_ptr + 3);
-
-      target_box_center_x =
-          box_var_x * target_box_data[offset] * prior_box_width +
-          prior_box_center_x;
-      target_box_center_y =
-          box_var_y * target_box_data[offset + 1] * prior_box_height +
-          prior_box_center_y;
-      target_box_width =
-          std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width;
-      target_box_height =
-          std::exp(box_var_h * target_box_data[offset + 3]) * prior_box_height;
-
-      output[offset] = target_box_center_x - target_box_width / 2;
-      output[offset + 1] = target_box_center_y - target_box_height / 2;
-      output[offset + 2] =
-          target_box_center_x + target_box_width / 2 - (normalized == false);
-      output[offset + 3] =
-          target_box_center_y + target_box_height / 2 - (normalized == false);
-    }
-  }
-}
-
 void BoxCoderCompute::Run() {
-  /*
-  auto& param = Param<operators::BoxCoderParam>();
-  int axis = param.axis;
-  bool box_normalized = param.box_normalized;
-  std::string code_type = param.code_type;
-
-  lite::arm::math::box_coder(param.proposals,
-                             param.prior_box,
-                             param.prior_box_var,
-                             param.target_box,
-                             code_type,
-                             box_normalized,
-                             axis);
-  */
   auto& param = Param<operators::BoxCoderParam>();
   auto* prior_box = param.prior_box;
   auto* prior_box_var = param.prior_box_var;
@@ -191,36 +42,20 @@ void BoxCoderCompute::Run() {
   output_box->Resize({row, col, len});
   auto* output = output_box->mutable_data<float>();
 
-  if (code_type == "encode_center_size") {
-    EncodeCenterSize(
-        target_box, prior_box, prior_box_var, normalized, variance, output);
-  } else if (code_type == "decode_center_size") {
-    if (prior_box_var) {
-      if (axis == 0) {
-        DecodeCenterSize<0, 2>(
-            target_box, prior_box, prior_box_var, normalized, variance, output);
-      } else {
-        DecodeCenterSize<1, 2>(
-            target_box, prior_box, prior_box_var, normalized, variance, output);
-      }
-    } else if (!(variance.empty())) {
-      if (axis == 0) {
-        DecodeCenterSize<0, 1>(
-            target_box, prior_box, prior_box_var, normalized, variance, output);
-      } else {
-        DecodeCenterSize<1, 1>(
-            target_box, prior_box, prior_box_var, normalized, variance, output);
-      }
-    } else {
-      if (axis == 0) {
-        DecodeCenterSize<0, 0>(
-            target_box, prior_box, prior_box_var, normalized, variance, output);
-      } else {
-        DecodeCenterSize<1, 0>(
-            target_box, prior_box, prior_box_var, normalized, variance, output);
-      }
-    }
-  }
+  int num = target_box->dims()[0];
+  const float* loc_data = target_box->data<float>();
+  const float* prior_data = prior_box->data<float>();
+  const float* variance_data = prior_box_var->data<float>();
+  int _num_priors = prior_box->numel() / 4;
+  bool _share_location = true;
+  lite::arm::math::decode_bboxes(num,
+                                 loc_data,
+                                 prior_data,
+                                 variance_data,
+                                 code_type,
+                                 normalized,
+                                 _num_priors,
+                                 output);
 }
 
 }  // namespace arm
-- 
GitLab