提交 decdbed0 编写于 作者: G guomingz 提交者: Tao Luo

resolve #15618 (#16114)

* resolve #15618
Backgroud: the PR #15398 raised the box_coder op performance regression, we optimized the code via the more efficency leveraging opemmp.
上级 1a4a90a8
......@@ -20,7 +20,7 @@ namespace operators {
enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };
inline BoxCodeType GetBoxCodeType(const std::string& type) {
inline BoxCodeType GetBoxCodeType(const std::string &type) {
if (type == "encode_center_size") {
return BoxCodeType::kEncodeCenterSize;
} else if (type == "decode_center_size") {
......@@ -32,24 +32,23 @@ inline BoxCodeType GetBoxCodeType(const std::string& type) {
template <typename DeviceContext, typename T>
class BoxCoderKernel : public framework::OpKernel<T> {
public:
void EncodeCenterSize(const framework::Tensor* target_box,
const framework::Tensor* prior_box,
const framework::Tensor* prior_box_var,
void EncodeCenterSize(const framework::Tensor *target_box,
const framework::Tensor *prior_box,
const framework::Tensor *prior_box_var,
const bool normalized,
const std::vector<float> variance, T* output) const {
const std::vector<float> variance, T *output) const {
int64_t row = target_box->dims()[0];
int64_t col = prior_box->dims()[0];
int64_t len = prior_box->dims()[1];
auto* target_box_data = target_box->data<T>();
auto* prior_box_data = prior_box->data<T>();
const T* prior_box_var_data = nullptr;
if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
auto *target_box_data = target_box->data<T>();
auto *prior_box_data = prior_box->data<T>();
size_t offset = i * col * len + j * len;
T prior_box_width = prior_box_data[j * len + 2] -
prior_box_data[j * len] + (normalized == false);
T prior_box_height = prior_box_data[j * len + 3] -
......@@ -69,7 +68,6 @@ class BoxCoderKernel : public framework::OpKernel<T> {
target_box_data[i * len + 1] +
(normalized == false);
size_t offset = i * col * len + j * len;
output[offset] =
(target_box_center_x - prior_box_center_x) / prior_box_width;
output[offset + 1] =
......@@ -78,44 +76,61 @@ class BoxCoderKernel : public framework::OpKernel<T> {
std::log(std::fabs(target_box_width / prior_box_width));
output[offset + 3] =
std::log(std::fabs(target_box_height / prior_box_height));
if (prior_box_var) {
int prior_var_offset = j * len;
output[offset] /= prior_box_var_data[prior_var_offset];
output[offset + 1] /= prior_box_var_data[prior_var_offset + 1];
output[offset + 2] /= prior_box_var_data[prior_var_offset + 2];
output[offset + 3] /= prior_box_var_data[prior_var_offset + 3];
} else if (!(variance.empty())) {
}
}
if (prior_box_var) {
const T *prior_box_var_data = prior_box_var->data<T>();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
for (int k = 0; k < 4; ++k) {
size_t offset = i * col * len + j * len;
int prior_var_offset = j * len;
output[offset + k] /= prior_box_var_data[prior_var_offset + k];
}
}
}
} else if (!(variance.empty())) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
for (int k = 0; k < 4; ++k) {
size_t offset = i * col * len + j * len;
output[offset + k] /= static_cast<T>(variance[k]);
}
}
}
}
}
template <int axis, int var_size>
void DecodeCenterSize(const framework::Tensor* target_box,
const framework::Tensor* prior_box,
const framework::Tensor* prior_box_var,
void DecodeCenterSize(const framework::Tensor *target_box,
const framework::Tensor *prior_box,
const framework::Tensor *prior_box_var,
const bool normalized, std::vector<float> variance,
T* output) const {
T *output) const {
int64_t row = target_box->dims()[0];
int64_t col = target_box->dims()[1];
int64_t len = target_box->dims()[2];
auto* target_box_data = target_box->data<T>();
auto* prior_box_data = prior_box->data<T>();
const T* prior_box_var_data = nullptr;
if (var_size == 2) prior_box_var_data = prior_box_var->data<T>();
int prior_box_offset = 0;
T var_data[4] = {1., 1., 1., 1.};
T* var_ptr = var_data;
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
auto *target_box_data = target_box->data<T>();
auto *prior_box_data = prior_box->data<T>();
T var_data[4] = {1., 1., 1., 1.};
T *var_ptr = var_data;
size_t offset = i * col * len + j * len;
prior_box_offset = axis == 0 ? j * len : i * len;
int prior_box_offset = axis == 0 ? j * len : i * len;
T prior_box_width = prior_box_data[prior_box_offset + 2] -
prior_box_data[prior_box_offset] +
(normalized == false);
......@@ -131,10 +146,10 @@ class BoxCoderKernel : public framework::OpKernel<T> {
T target_box_width = 0, target_box_height = 0;
int prior_var_offset = axis == 0 ? j * len : i * len;
if (var_size == 2) {
std::memcpy(var_ptr, prior_box_var_data + prior_var_offset,
std::memcpy(var_ptr, prior_box_var->data<T>() + prior_var_offset,
4 * sizeof(T));
} else if (var_size == 1) {
var_ptr = reinterpret_cast<T*>(variance.data());
var_ptr = reinterpret_cast<T *>(variance.data());
}
T box_var_x = *var_ptr;
T box_var_y = *(var_ptr + 1);
......@@ -162,11 +177,11 @@ class BoxCoderKernel : public framework::OpKernel<T> {
}
}
void Compute(const framework::ExecutionContext& context) const override {
auto* prior_box = context.Input<framework::Tensor>("PriorBox");
auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
auto* output_box = context.Output<framework::Tensor>("OutputBox");
void Compute(const framework::ExecutionContext &context) const override {
auto *prior_box = context.Input<framework::Tensor>("PriorBox");
auto *prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
auto *target_box = context.Input<framework::LoDTensor>("TargetBox");
auto *output_box = context.Output<framework::Tensor>("OutputBox");
std::vector<float> variance = context.Attr<std::vector<float>>("variance");
const int axis = context.Attr<int>("axis");
if (target_box->lod().size()) {
......@@ -194,7 +209,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
output_box->mutable_data<T>({row, col, len}, context.GetPlace());
T* output = output_box->data<T>();
T *output = output_box->data<T>();
if (code_type == BoxCodeType::kEncodeCenterSize) {
EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
variance, output);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册