提交 630c1e83 编写于 作者: G guomingz 提交者: Tao Luo

This PR improve performance of prior_box op about 1.25x faster on CPU. (#15909)

* This PR improve performance of prior_box op about 1.25x faster on CPU.

* Test Env:SKX 8180 with fake data on 28 threads(bs=1).
* The below table shows the ~25% improvement which generated by [eval_tp_fake_data.py](https://github.com/PaddlePaddle/Paddle/issues/15618#issuecomment-464613976).

| Type |Event | Calls |   Total     |  Min.    |   Max.      |  Ave.      |  Ratio.|
| ---------------- | ------------------ | ---- | ------- | -------- | -------- | ------------ | -------- |
| w/ optimization  | thread0::prior_box | 6000 | 921.201 | 0.110572 | 0.383402 | **0.153533** | 0.084585 |
| w/o optimization | thread0::prior_box | 6000 | 1151.85 | 0.102276 | 0.426702 | **0.191976** | 0.103337 |

test=develop

* Fix the style issue.

test=develop
上级 9c05421c
......@@ -172,6 +172,10 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
framework::make_ddim({1, static_cast<int>(variances.size())}),
ctx.GetPlace());
auto var_et = framework::EigenTensor<T, 2>::From(var_t);
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (size_t i = 0; i < variances.size(); ++i) {
var_et(0, i) = variances[i];
}
......@@ -181,8 +185,15 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
vars->Resize({box_num, static_cast<int>(variances.size())});
auto e_vars = framework::EigenMatrix<T, Eigen::RowMajor>::From(*vars);
e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(box_num, 1));
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int i = 0; i < box_num; ++i) {
for (int j = 0; j < variances.size(); ++j) {
e_vars(i, j) = variances[j];
}
}
vars->Resize(var_dim);
}
}; // namespace operators
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册