From 292f52ccb17428b92ed61d74e5588648b7ff196d Mon Sep 17 00:00:00 2001 From: yongqiangma Date: Wed, 19 Aug 2020 12:36:49 +0800 Subject: [PATCH] change box_coder. test=develop (#4107) --- lite/backends/arm/math/box_coder.cc | 267 ++++++++++++++++++++------ lite/backends/arm/math/box_coder.h | 15 +- lite/kernels/arm/box_coder_compute.cc | 193 ++----------------- 3 files changed, 228 insertions(+), 247 deletions(-) diff --git a/lite/backends/arm/math/box_coder.cc b/lite/backends/arm/math/box_coder.cc index 7cb904a8ee..ea9187cec8 100644 --- a/lite/backends/arm/math/box_coder.cc +++ b/lite/backends/arm/math/box_coder.cc @@ -20,69 +20,214 @@ namespace lite { namespace arm { namespace math { -void box_coder(lite::Tensor* proposals, - const lite::Tensor* anchors, - const lite::Tensor* variances, - const lite::Tensor* bbox_deltas, - const std::string code_type, - bool box_normalized, - int axis) { - if (code_type == "decode_center_size") { - float normalized = !box_normalized ? 1.f : 0; - - const float* anchor_data = anchors->data(); - const float* bbox_deltas_data = bbox_deltas->data(); - float* proposals_data = proposals->mutable_data(); - const float* variances_data = variances->data(); - - int N = bbox_deltas->dims()[0]; - int M = bbox_deltas->dims()[1]; - int len = bbox_deltas->dims()[2]; - - for (int64_t row_id = 0; row_id < N; ++row_id) { - for (int64_t col_id = 0; col_id < M; ++col_id) { - size_t offset = row_id * M * len + col_id * len; - int prior_box_offset = axis == 0 ? col_id * len : row_id * len; - int var_offset = axis == 0 ? col_id * len : row_id * len; - - auto anchor_data_tmp = anchor_data + prior_box_offset; - auto bbox_deltas_data_tmp = bbox_deltas_data + offset; - auto proposals_data_tmp = proposals_data + offset; - - auto anchor_width = - anchor_data_tmp[2] - anchor_data_tmp[0] + normalized; - auto anchor_height = - anchor_data_tmp[3] - anchor_data_tmp[1] + normalized; - auto anchor_center_x = anchor_data_tmp[0] + 0.5 * anchor_width; - auto anchor_center_y = anchor_data_tmp[1] + 0.5 * anchor_height; - - float bbox_center_x = 0, bbox_center_y = 0; - float bbox_width = 0, bbox_height = 0; - - auto variances_data_tmp = variances_data + var_offset; - - bbox_center_x = - variances_data_tmp[0] * bbox_deltas_data_tmp[0] * anchor_width + - anchor_center_x; - bbox_center_y = - variances_data_tmp[1] * bbox_deltas_data_tmp[1] * anchor_height + - anchor_center_y; - bbox_width = std::exp(variances_data_tmp[2] * bbox_deltas_data_tmp[2]) * - anchor_width; - bbox_height = - std::exp(variances_data_tmp[3] * bbox_deltas_data_tmp[3]) * - anchor_height; - - proposals_data_tmp[0] = bbox_center_x - bbox_width / 2; - proposals_data_tmp[1] = bbox_center_y - bbox_height / 2; - proposals_data_tmp[2] = bbox_center_x + bbox_width / 2 - normalized; - proposals_data_tmp[3] = bbox_center_y + bbox_height / 2 - normalized; - } +void decode_bbox_center_variance_kernel(const int batch_num, + const float* loc_data, + const float* prior_data, + const float* variance, + const int num_priors, + float* bbox_data) { + int cnt = num_priors / 4; + //! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax + //! vloc 0: xmin, 1: ymin, 2: xmax, 3: ymax + //! vvar + float32x4_t vhalf = vdupq_n_f32(0.5f); + + int len_batch = num_priors * 4; + + for (int n = 0; n < batch_num; ++n) { + const float* ptr_loc_batch = loc_data + n * len_batch; + float* ptr_bbox_batch = bbox_data + n * len_batch; + +#pragma omp parallel for + for (int i = 0; i < cnt; ++i) { + int idx = i * 16; + const float* ptr_loc = ptr_loc_batch + idx; + const float* ptr_prior = prior_data + idx; + float* ptr_bbox = ptr_bbox_batch + idx; + + float32x4x4_t vprior = vld4q_f32(ptr_prior); + float32x4x4_t vloc = vld4q_f32(ptr_loc); + float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]); + float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]); + float32x4_t vprior_cx = + vmulq_f32(vaddq_f32(vprior.val[0], vprior.val[2]), vhalf); + float32x4_t vprior_cy = + vmulq_f32(vaddq_f32(vprior.val[1], vprior.val[3]), vhalf); + + float32x4_t vdec_bbx_cx = + vaddq_f32(vmulq_f32(vloc.val[0], vprior_width), vprior_cx); + float32x4_t vdec_bbx_cy = + vaddq_f32(vmulq_f32(vloc.val[1], vprior_height), vprior_cy); + float32x4_t vdec_bbx_w = exp_ps(vloc.val[2]); + float32x4_t vdec_bbx_h = exp_ps(vloc.val[3]); + vprior_width = vmulq_f32(vprior_width, vhalf); + vprior_height = vmulq_f32(vprior_height, vhalf); + vdec_bbx_w = vmulq_f32(vdec_bbx_w, vprior_width); + vdec_bbx_h = vmulq_f32(vdec_bbx_h, vprior_height); + + vloc.val[0] = vsubq_f32(vdec_bbx_cx, vdec_bbx_w); + vloc.val[1] = vsubq_f32(vdec_bbx_cy, vdec_bbx_h); + vloc.val[2] = vaddq_f32(vdec_bbx_cx, vdec_bbx_w); + vloc.val[3] = vaddq_f32(vdec_bbx_cy, vdec_bbx_h); + + vst4q_f32(ptr_bbox, vloc); } - } else if (code_type == "encode_center_size") { - LOG(FATAL) << "not implemented type: " << code_type; +#pragma omp parallel for + for (int i = cnt * 4; i < num_priors; i++) { + int idx = i * 4; + float p_xmin = prior_data[idx]; + float p_ymin = prior_data[idx + 1]; + float p_xmax = prior_data[idx + 2]; + float p_ymax = prior_data[idx + 3]; + float prior_width = p_xmax - p_xmin; + float prior_height = p_ymax - p_ymin; + float prior_center_x = (p_xmin + p_xmax) / 2.f; + float prior_center_y = (p_ymin + p_ymax) / 2.f; + + float xmin = ptr_loc_batch[idx]; + float ymin = ptr_loc_batch[idx + 1]; + float xmax = ptr_loc_batch[idx + 2]; + float ymax = ptr_loc_batch[idx + 3]; + + //! variance is encoded in target, we simply need to retore the offset + //! predictions. + float decode_bbox_center_x = xmin * prior_width + prior_center_x; + float decode_bbox_center_y = ymin * prior_height + prior_center_y; + float decode_bbox_width = expf(xmax) * prior_width; + float decode_bbox_height = expf(ymax) * prior_height; + + ptr_bbox_batch[idx] = decode_bbox_center_x - decode_bbox_width / 2.f; + ptr_bbox_batch[idx + 1] = decode_bbox_center_y - decode_bbox_height / 2.f; + ptr_bbox_batch[idx + 2] = decode_bbox_center_x + decode_bbox_width / 2.f; + ptr_bbox_batch[idx + 3] = decode_bbox_center_y + decode_bbox_height / 2.f; + } + } +} + +void decode_bbox_center_no_variance_kernel(const int batch_num, + const float* loc_data, + const float* prior_data, + const float* variance, + const int num_priors, + const bool normalized, + float* bbox_data) { + int cnt = num_priors / 4; + //! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax + //! vloc 0: xmin, 1: ymin, 2: xmax, 3: ymax + //! vvar + float32x4_t vhalf = vdupq_n_f32(0.5f); + float norm_value = (normalized == false); + float32x4_t vnormalized = vdupq_n_f32(norm_value); + int len_batch = num_priors * 4; + + for (int n = 0; n < batch_num; ++n) { + const float* ptr_loc_batch = loc_data + n * len_batch; + float* ptr_bbox_batch = bbox_data + n * len_batch; + +#pragma omp parallel for + for (int i = 0; i < cnt; ++i) { + int idx = i * 16; + + const float* ptr_loc = ptr_loc_batch + idx; + const float* ptr_prior = prior_data + idx; + const float* ptr_var = variance + idx; + float* ptr_bbox = ptr_bbox_batch + idx; + + float32x4x4_t vprior = vld4q_f32(ptr_prior); + float32x4x4_t vloc = vld4q_f32(ptr_loc); + float32x4x4_t vvar = vld4q_f32(ptr_var); + float32x4_t vprior_width1 = vsubq_f32(vprior.val[2], vprior.val[0]); + float32x4_t vprior_height1 = vsubq_f32(vprior.val[3], vprior.val[1]); + float32x4_t vprior_width = vaddq_f32(vprior_width1, vnormalized); + float32x4_t vprior_height = vaddq_f32(vprior_height1, vnormalized); + + float32x4_t vprior_cx = + vaddq_f32(vprior.val[0], vmulq_f32(vprior_width, vhalf)); + float32x4_t vprior_cy = + vaddq_f32(vprior.val[1], vmulq_f32(vprior_height, vhalf)); + + vloc.val[0] = vmulq_f32(vloc.val[0], vvar.val[0]); + vloc.val[1] = vmulq_f32(vloc.val[1], vvar.val[1]); + vloc.val[2] = vmulq_f32(vloc.val[2], vvar.val[2]); + vloc.val[3] = vmulq_f32(vloc.val[3], vvar.val[3]); + + float32x4_t vdec_bbx_cx = + vaddq_f32(vmulq_f32(vloc.val[0], vprior_width), vprior_cx); + float32x4_t vdec_bbx_cy = + vaddq_f32(vmulq_f32(vloc.val[1], vprior_height), vprior_cy); + float32x4_t vdec_bbx_w = exp_ps(vloc.val[2]); + float32x4_t vdec_bbx_h = exp_ps(vloc.val[3]); + vprior_width = vmulq_f32(vprior_width, vhalf); + vprior_height = vmulq_f32(vprior_height, vhalf); + vdec_bbx_w = vmulq_f32(vdec_bbx_w, vprior_width); + vdec_bbx_h = vmulq_f32(vdec_bbx_h, vprior_height); + + vloc.val[0] = vsubq_f32(vdec_bbx_cx, vdec_bbx_w); + vloc.val[1] = vsubq_f32(vdec_bbx_cy, vdec_bbx_h); + vloc.val[2] = vaddq_f32(vdec_bbx_cx, vsubq_f32(vdec_bbx_w, vnormalized)); + vloc.val[3] = vaddq_f32(vdec_bbx_cy, vsubq_f32(vdec_bbx_h, vnormalized)); + + vst4q_f32(ptr_bbox, vloc); + } + +#pragma omp parallel for + for (int i = cnt * 4; i < num_priors; i++) { + int idx = i * 4; + float p_xmin = prior_data[idx]; + float p_ymin = prior_data[idx + 1]; + float p_xmax = prior_data[idx + 2]; + float p_ymax = prior_data[idx + 3]; + float prior_width = p_xmax - p_xmin + norm_value; + float prior_height = p_ymax - p_ymin + norm_value; + float prior_center_x = p_xmin + prior_width / 2.f; + float prior_center_y = p_ymin + prior_height / 2.f; + + float xmin = ptr_loc_batch[idx]; + float ymin = ptr_loc_batch[idx + 1]; + float xmax = ptr_loc_batch[idx + 2]; + float ymax = ptr_loc_batch[idx + 3]; + + //! variance is encoded in target, we simply need to retore the offset + //! predictions. + float decode_bbox_center_x = + variance[idx] * xmin * prior_width + prior_center_x; + float decode_bbox_center_y = + variance[idx + 1] * ymin * prior_height + prior_center_y; + float decode_bbox_width = expf(variance[idx + 2] * xmax) * prior_width; + float decode_bbox_height = expf(variance[idx + 3] * ymax) * prior_height; + + ptr_bbox_batch[idx] = decode_bbox_center_x - decode_bbox_width / 2.f; + ptr_bbox_batch[idx + 1] = decode_bbox_center_y - decode_bbox_height / 2.f; + ptr_bbox_batch[idx + 2] = + decode_bbox_center_x + decode_bbox_width / 2.f - norm_value; + ptr_bbox_batch[idx + 3] = + decode_bbox_center_y + decode_bbox_height / 2.f - norm_value; + } + } +} + +void decode_bboxes(const int batch_num, + const float* loc_data, + const float* prior_data, + const float* variance_data, + const std::string code_type, + const bool normalized, + const int num_priors, + float* bbox_data) { + if (code_type == "encode_center_size") { + decode_bbox_center_variance_kernel( + batch_num, loc_data, prior_data, variance_data, num_priors, bbox_data); + + } else if (code_type == "decode_center_size") { + decode_bbox_center_no_variance_kernel(batch_num, + loc_data, + prior_data, + variance_data, + num_priors, + normalized, + bbox_data); } else { - LOG(FATAL) << "not supported type: " << code_type; + LOG(FATAL) << "box_coder don't support this code_type: " << code_type; } } diff --git a/lite/backends/arm/math/box_coder.h b/lite/backends/arm/math/box_coder.h index bbeb3e0618..29281dc5c9 100644 --- a/lite/backends/arm/math/box_coder.h +++ b/lite/backends/arm/math/box_coder.h @@ -22,13 +22,14 @@ namespace lite { namespace arm { namespace math { -void box_coder(lite::Tensor* proposals, - const lite::Tensor* anchors, - const lite::Tensor* variances, - const lite::Tensor* bbox_deltas, - const std::string code_type, - bool box_normalized, - int axis); +void decode_bboxes(const int batch_num, + const float* loc_data, + const float* prior_data, + const float* variance_data, + const std::string code_type, + const bool normalized, + const int num_priors, + float* bbox_data); } // namespace math } // namespace arm diff --git a/lite/kernels/arm/box_coder_compute.cc b/lite/kernels/arm/box_coder_compute.cc index 81e79a83f2..8b0c38c97c 100644 --- a/lite/kernels/arm/box_coder_compute.cc +++ b/lite/kernels/arm/box_coder_compute.cc @@ -22,156 +22,7 @@ namespace lite { namespace kernels { namespace arm { -void EncodeCenterSize(const Tensor* target_box, - const Tensor* prior_box, - const Tensor* prior_box_var, - const bool normalized, - const std::vector variance, - float* output) { - int64_t row = target_box->dims()[0]; - int64_t col = prior_box->dims()[0]; - int64_t len = prior_box->dims()[1]; - for (int64_t i = 0; i < row; ++i) { - for (int64_t j = 0; j < col; ++j) { - auto* target_box_data = target_box->data(); - auto* prior_box_data = prior_box->data(); - int64_t offset = i * col * len + j * len; - float prior_box_width = prior_box_data[j * len + 2] - - prior_box_data[j * len] + (normalized == false); - float prior_box_height = prior_box_data[j * len + 3] - - prior_box_data[j * len + 1] + - (normalized == false); - float prior_box_center_x = prior_box_data[j * len] + prior_box_width / 2; - float prior_box_center_y = - prior_box_data[j * len + 1] + prior_box_height / 2; - - float target_box_center_x = - (target_box_data[i * len + 2] + target_box_data[i * len]) / 2; - float target_box_center_y = - (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2; - float target_box_width = target_box_data[i * len + 2] - - target_box_data[i * len] + (normalized == false); - float target_box_height = target_box_data[i * len + 3] - - target_box_data[i * len + 1] + - (normalized == false); - - output[offset] = - (target_box_center_x - prior_box_center_x) / prior_box_width; - output[offset + 1] = - (target_box_center_y - prior_box_center_y) / prior_box_height; - output[offset + 2] = - std::log(std::fabs(target_box_width / prior_box_width)); - output[offset + 3] = - std::log(std::fabs(target_box_height / prior_box_height)); - } - } - - if (prior_box_var) { - const float* prior_box_var_data = prior_box_var->data(); - for (int64_t i = 0; i < row; ++i) { - for (int64_t j = 0; j < col; ++j) { - for (int k = 0; k < 4; ++k) { - int64_t offset = i * col * len + j * len; - int64_t prior_var_offset = j * len; - output[offset + k] /= prior_box_var_data[prior_var_offset + k]; - } - } - } - } else if (!(variance.empty())) { - for (int64_t i = 0; i < row; ++i) { - for (int64_t j = 0; j < col; ++j) { - for (int k = 0; k < 4; ++k) { - int64_t offset = i * col * len + j * len; - output[offset + k] /= static_cast(variance[k]); - } - } - } - } -} - -template -void DecodeCenterSize(const Tensor* target_box, - const Tensor* prior_box, - const Tensor* prior_box_var, - const bool normalized, - std::vector variance, - float* output) { - int64_t row = target_box->dims()[0]; - int64_t col = target_box->dims()[1]; - int64_t len = target_box->dims()[2]; - - for (int64_t i = 0; i < row; ++i) { - for (int64_t j = 0; j < col; ++j) { - auto* target_box_data = target_box->data(); - auto* prior_box_data = prior_box->data(); - - float var_data[4] = {1., 1., 1., 1.}; - float* var_ptr = var_data; - int64_t offset = i * col * len + j * len; - int64_t prior_box_offset = axis == 0 ? j * len : i * len; - - float prior_box_width = prior_box_data[prior_box_offset + 2] - - prior_box_data[prior_box_offset] + - (normalized == false); - float prior_box_height = prior_box_data[prior_box_offset + 3] - - prior_box_data[prior_box_offset + 1] + - (normalized == false); - float prior_box_center_x = - prior_box_data[prior_box_offset] + prior_box_width / 2; - float prior_box_center_y = - prior_box_data[prior_box_offset + 1] + prior_box_height / 2; - - float target_box_center_x = 0, target_box_center_y = 0; - float target_box_width = 0, target_box_height = 0; - int64_t prior_var_offset = axis == 0 ? j * len : i * len; - if (var_size == 2) { - std::memcpy(var_ptr, - prior_box_var->data() + prior_var_offset, - 4 * sizeof(float)); - } else if (var_size == 1) { - var_ptr = reinterpret_cast(variance.data()); - } - float box_var_x = *var_ptr; - float box_var_y = *(var_ptr + 1); - float box_var_w = *(var_ptr + 2); - float box_var_h = *(var_ptr + 3); - - target_box_center_x = - box_var_x * target_box_data[offset] * prior_box_width + - prior_box_center_x; - target_box_center_y = - box_var_y * target_box_data[offset + 1] * prior_box_height + - prior_box_center_y; - target_box_width = - std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width; - target_box_height = - std::exp(box_var_h * target_box_data[offset + 3]) * prior_box_height; - - output[offset] = target_box_center_x - target_box_width / 2; - output[offset + 1] = target_box_center_y - target_box_height / 2; - output[offset + 2] = - target_box_center_x + target_box_width / 2 - (normalized == false); - output[offset + 3] = - target_box_center_y + target_box_height / 2 - (normalized == false); - } - } -} - void BoxCoderCompute::Run() { - /* - auto& param = Param(); - int axis = param.axis; - bool box_normalized = param.box_normalized; - std::string code_type = param.code_type; - - lite::arm::math::box_coder(param.proposals, - param.prior_box, - param.prior_box_var, - param.target_box, - code_type, - box_normalized, - axis); - */ auto& param = Param(); auto* prior_box = param.prior_box; auto* prior_box_var = param.prior_box_var; @@ -191,36 +42,20 @@ void BoxCoderCompute::Run() { output_box->Resize({row, col, len}); auto* output = output_box->mutable_data(); - if (code_type == "encode_center_size") { - EncodeCenterSize( - target_box, prior_box, prior_box_var, normalized, variance, output); - } else if (code_type == "decode_center_size") { - if (prior_box_var) { - if (axis == 0) { - DecodeCenterSize<0, 2>( - target_box, prior_box, prior_box_var, normalized, variance, output); - } else { - DecodeCenterSize<1, 2>( - target_box, prior_box, prior_box_var, normalized, variance, output); - } - } else if (!(variance.empty())) { - if (axis == 0) { - DecodeCenterSize<0, 1>( - target_box, prior_box, prior_box_var, normalized, variance, output); - } else { - DecodeCenterSize<1, 1>( - target_box, prior_box, prior_box_var, normalized, variance, output); - } - } else { - if (axis == 0) { - DecodeCenterSize<0, 0>( - target_box, prior_box, prior_box_var, normalized, variance, output); - } else { - DecodeCenterSize<1, 0>( - target_box, prior_box, prior_box_var, normalized, variance, output); - } - } - } + int num = target_box->dims()[0]; + const float* loc_data = target_box->data(); + const float* prior_data = prior_box->data(); + const float* variance_data = prior_box_var->data(); + int _num_priors = prior_box->numel() / 4; + bool _share_location = true; + lite::arm::math::decode_bboxes(num, + loc_data, + prior_data, + variance_data, + code_type, + normalized, + _num_priors, + output); } } // namespace arm -- GitLab