From decdbed054dc649abbc78e0a3405d67321a10315 Mon Sep 17 00:00:00 2001 From: guomingz Date: Wed, 13 Mar 2019 20:51:26 +0800 Subject: [PATCH] resolve #15618 (#16114) * resolve #15618 Background: PR #15398 introduced a performance regression in the box_coder op; we optimized the code by leveraging OpenMP more efficiently. --- .../fluid/operators/detection/box_coder_op.h | 89 +++++++++++-------- 1 file changed, 52 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h index 6d406f8196f..d4c7e8cf772 100644 --- a/paddle/fluid/operators/detection/box_coder_op.h +++ b/paddle/fluid/operators/detection/box_coder_op.h @@ -20,7 +20,7 @@ namespace operators { enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 }; -inline BoxCodeType GetBoxCodeType(const std::string& type) { +inline BoxCodeType GetBoxCodeType(const std::string &type) { if (type == "encode_center_size") { return BoxCodeType::kEncodeCenterSize; } else if (type == "decode_center_size") { @@ -32,24 +32,23 @@ inline BoxCodeType GetBoxCodeType(const std::string& type) { template class BoxCoderKernel : public framework::OpKernel { public: - void EncodeCenterSize(const framework::Tensor* target_box, - const framework::Tensor* prior_box, - const framework::Tensor* prior_box_var, + void EncodeCenterSize(const framework::Tensor *target_box, + const framework::Tensor *prior_box, + const framework::Tensor *prior_box_var, const bool normalized, - const std::vector variance, T* output) const { + const std::vector variance, T *output) const { int64_t row = target_box->dims()[0]; int64_t col = prior_box->dims()[0]; int64_t len = prior_box->dims()[1]; - auto* target_box_data = target_box->data(); - auto* prior_box_data = prior_box->data(); - const T* prior_box_var_data = nullptr; - if (prior_box_var) prior_box_var_data = prior_box_var->data(); #ifdef PADDLE_WITH_MKLML #pragma omp parallel for collapse(2) #endif for (int64_t i = 
0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { + auto *target_box_data = target_box->data(); + auto *prior_box_data = prior_box->data(); + size_t offset = i * col * len + j * len; T prior_box_width = prior_box_data[j * len + 2] - prior_box_data[j * len] + (normalized == false); T prior_box_height = prior_box_data[j * len + 3] - @@ -69,7 +68,6 @@ class BoxCoderKernel : public framework::OpKernel { target_box_data[i * len + 1] + (normalized == false); - size_t offset = i * col * len + j * len; output[offset] = (target_box_center_x - prior_box_center_x) / prior_box_width; output[offset + 1] = @@ -78,44 +76,61 @@ class BoxCoderKernel : public framework::OpKernel { std::log(std::fabs(target_box_width / prior_box_width)); output[offset + 3] = std::log(std::fabs(target_box_height / prior_box_height)); - if (prior_box_var) { - int prior_var_offset = j * len; - output[offset] /= prior_box_var_data[prior_var_offset]; - output[offset + 1] /= prior_box_var_data[prior_var_offset + 1]; - output[offset + 2] /= prior_box_var_data[prior_var_offset + 2]; - output[offset + 3] /= prior_box_var_data[prior_var_offset + 3]; - } else if (!(variance.empty())) { + } + } + + if (prior_box_var) { + const T *prior_box_var_data = prior_box_var->data(); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(3) +#endif + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { for (int k = 0; k < 4; ++k) { + size_t offset = i * col * len + j * len; + int prior_var_offset = j * len; + output[offset + k] /= prior_box_var_data[prior_var_offset + k]; + } + } + } + } else if (!(variance.empty())) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(3) +#endif + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + for (int k = 0; k < 4; ++k) { + size_t offset = i * col * len + j * len; output[offset + k] /= static_cast(variance[k]); } } } } } + template - void DecodeCenterSize(const framework::Tensor* target_box, - const framework::Tensor* 
prior_box, - const framework::Tensor* prior_box_var, + void DecodeCenterSize(const framework::Tensor *target_box, + const framework::Tensor *prior_box, + const framework::Tensor *prior_box_var, const bool normalized, std::vector variance, - T* output) const { + T *output) const { int64_t row = target_box->dims()[0]; int64_t col = target_box->dims()[1]; int64_t len = target_box->dims()[2]; - auto* target_box_data = target_box->data(); - auto* prior_box_data = prior_box->data(); - const T* prior_box_var_data = nullptr; - if (var_size == 2) prior_box_var_data = prior_box_var->data(); - int prior_box_offset = 0; - T var_data[4] = {1., 1., 1., 1.}; - T* var_ptr = var_data; #ifdef PADDLE_WITH_MKLML #pragma omp parallel for collapse(2) #endif for (int64_t i = 0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { + auto *target_box_data = target_box->data(); + auto *prior_box_data = prior_box->data(); + + T var_data[4] = {1., 1., 1., 1.}; + T *var_ptr = var_data; size_t offset = i * col * len + j * len; - prior_box_offset = axis == 0 ? j * len : i * len; + int prior_box_offset = axis == 0 ? j * len : i * len; + T prior_box_width = prior_box_data[prior_box_offset + 2] - prior_box_data[prior_box_offset] + (normalized == false); @@ -131,10 +146,10 @@ class BoxCoderKernel : public framework::OpKernel { T target_box_width = 0, target_box_height = 0; int prior_var_offset = axis == 0 ? 
j * len : i * len; if (var_size == 2) { - std::memcpy(var_ptr, prior_box_var_data + prior_var_offset, + std::memcpy(var_ptr, prior_box_var->data() + prior_var_offset, 4 * sizeof(T)); } else if (var_size == 1) { - var_ptr = reinterpret_cast(variance.data()); + var_ptr = reinterpret_cast(variance.data()); } T box_var_x = *var_ptr; T box_var_y = *(var_ptr + 1); @@ -162,11 +177,11 @@ class BoxCoderKernel : public framework::OpKernel { } } - void Compute(const framework::ExecutionContext& context) const override { - auto* prior_box = context.Input("PriorBox"); - auto* prior_box_var = context.Input("PriorBoxVar"); - auto* target_box = context.Input("TargetBox"); - auto* output_box = context.Output("OutputBox"); + void Compute(const framework::ExecutionContext &context) const override { + auto *prior_box = context.Input("PriorBox"); + auto *prior_box_var = context.Input("PriorBoxVar"); + auto *target_box = context.Input("TargetBox"); + auto *output_box = context.Output("OutputBox"); std::vector variance = context.Attr>("variance"); const int axis = context.Attr("axis"); if (target_box->lod().size()) { @@ -194,7 +209,7 @@ class BoxCoderKernel : public framework::OpKernel { output_box->mutable_data({row, col, len}, context.GetPlace()); - T* output = output_box->data(); + T *output = output_box->data(); if (code_type == BoxCodeType::kEncodeCenterSize) { EncodeCenterSize(target_box, prior_box, prior_box_var, normalized, variance, output); -- GitLab