提交 7c59db45 编写于 作者: E eclipsess

optimize some ops

上级 770c8d85
...@@ -65,16 +65,17 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const { ...@@ -65,16 +65,17 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
/// ((x - est_mean) * (inv_var) * scale + bias equal to /// ((x - est_mean) * (inv_var) * scale + bias equal to
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale) /// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
for (int i = 0; i < C; i++) { for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
{ {
for (int n = 0; n < N; n++) { for (int n = 0; n < N; n++) {
for (int h = 0; h < H; h++) { for (int h = 0; h < H; h++) {
for (int w = 0; w < W; w++) { int tmp_index = n * stride0 + i * stride1 + h * stride2;
int index = n * stride0 + i * stride1 + h * stride2 + w; for (int w = 0; w < W; w++) {
out_ptr[index] = int index = tmp_index + w;
input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i]; out_ptr[index] =
input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
} }
} }
} }
......
...@@ -84,33 +84,36 @@ void StridedNumelCopyWithAxis(int64_t axis, T *dst, ...@@ -84,33 +84,36 @@ void StridedNumelCopyWithAxis(int64_t axis, T *dst,
} }
} }
template <> template <>
void ConcatKernel<CPU, float>::Compute(const ConcatParam &param) const { void ConcatKernel<CPU, float>::Compute(const ConcatParam &param) const {
auto inputs = param.Inputs(); auto inputs = param.Inputs();
auto *out = param.Out(); auto *out = param.Out();
int64_t axis = param.Axis(); int64_t axis = param.Axis();
out->mutable_data<float>(); out->mutable_data<float>();
/// Sometimes direct copies will be faster, this maybe need deeply analysis. /// Sometimes direct copies will be faster, this maybe need deeply analysis.
if (axis == 0 && inputs.size() < 10) { if (axis == 0 && inputs.size() < 10) {
size_t output_offset = 0; size_t output_offset = 0;
for (auto *in : inputs) { for (auto *in : inputs) {
auto in_stride = framework::stride_numel(in->dims()); auto in_stride = framework::stride_numel(in->dims());
auto out_stride = framework::stride_numel(out->dims()); auto out_stride = framework::stride_numel(out->dims());
StridedNumelCopyWithAxis<float>(axis, out->data<float>() + output_offset, auto dst = out->data<float>() + output_offset;
out_stride, in->data<float>(), in_stride, auto src = in->data<float>();
in_stride[axis]); PADDLE_MOBILE_ENFORCE(
output_offset += in_stride[axis]; in_stride.size() == out_stride.size(),
} "src and dst tensor should have the same dims size.");
} else { memory::Copy(dst, src, sizeof(float) * in_stride[0]);
std::vector<framework::Tensor> inputs_concat(inputs.size()); output_offset += in_stride[0];
for (int j = 0; j < inputs.size(); ++j) { }
inputs_concat[j] = *inputs[j]; } else {
std::vector<framework::Tensor> inputs_concat(inputs.size());
for (int j = 0; j < inputs.size(); ++j) {
inputs_concat[j] = *inputs[j];
}
ConcatFunctor<float> concat_functor;
concat_functor(inputs_concat, static_cast<int>(axis), out);
}
} }
ConcatFunctor<float> concat_functor;
concat_functor(inputs_concat, static_cast<int>(axis), out);
}
}
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -14,20 +14,30 @@ limitations under the License. */ ...@@ -14,20 +14,30 @@ limitations under the License. */
#pragma once #pragma once
#include <operators/math/transform.h>
#include "operators/kernel/relu_kernel.h" #include "operators/kernel/relu_kernel.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <typename T>
void ReluKernel<CPU, float>::Compute(const ReluParam &param) const { struct ReluFunctor {
const auto *input_x = param.InputX(); inline T operator()(T in) const { return in > 0 ? in : 0; }
auto *input_x_ptr = input_x->data<float>(); };
auto *out = param.Out(); template <>
auto *out_ptr = out->mutable_data<float>(); void ReluKernel<CPU, float>::Compute(const ReluParam &param) const {
for (int i = 0; i < input_x->numel(); i++) { const auto *input_x = param.InputX();
out_ptr[i] = input_x_ptr[i] > 0 ? input_x_ptr[i] : 0; auto *input_x_ptr = input_x->data<float>();
} auto *out = param.Out();
} auto *out_ptr = out->mutable_data<float>();
ReluFunctor<float> func_;
math::Transform trans;
trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_);
// for (int i = 0; i < input_x->numel(); i++) {
// out_ptr[i] = input_x_ptr[i] > 0 ? input_x_ptr[i] : 0;
// }
}
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
\ No newline at end of file
...@@ -42,12 +42,13 @@ struct LRNFunctor { ...@@ -42,12 +42,13 @@ struct LRNFunctor {
for (int index = start; index < end; index++) { for (int index = start; index < end; index++) {
int channel = b + index; int channel = b + index;
if (channel >= 0 && channel < C) { if (channel >= 0 && channel < C) {
int tmp_u = a * stride0 + b * stride1;
int tmp_i = a * stride0 + channel * stride1;
for (int c = 0; c < H; c++) { for (int c = 0; c < H; c++) {
for (int d = 0; d < W; d++) { for (int d = 0; d < W; d++) {
int u = a * stride0 + b * stride1 + c * stride2 + d; int tmp = c * stride2 + d;
int u = tmp_u + tmp;
int i = a * stride0 + channel * stride1 + c * stride2 + d; int i = tmp_i + tmp;
sqr_buffer_ptr[u] += alpha * input_ptr[i] * input_ptr[i]; sqr_buffer_ptr[u] += alpha * input_ptr[i] * input_ptr[i];
} }
} }
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0) #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
/* /*
* Out = X ⊙ Y * Out = X ⊙ Y
...@@ -31,180 +31,148 @@ namespace operators { ...@@ -31,180 +31,148 @@ namespace operators {
* pre=2*3, n=4*5, post=1 * pre=2*3, n=4*5, post=1
* x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1) * x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1)
*/ */
/// Splits the shape of x around the span matched by y, starting at `axis`.
///
/// @param x_dims  shape of the larger operand
/// @param y_dims  shape of the operand being broadcast
/// @param axis    index in x_dims where y_dims starts
/// @param pre     out: product of x_dims[0 .. axis)
/// @param n       out: product of all y_dims entries
/// @param post    out: product of x_dims[axis + y_dims.size() .. end)
inline void get_mid_dims(const framework::DDim &x_dims,
                         const framework::DDim &y_dims, const int axis,
                         int *pre, int *n, int *post) {
  *pre = 1;
  *n = 1;
  *post = 1;

  // Leading part: every x dimension in front of `axis`.
  int lead = 0;
  while (lead < axis) {
    (*pre) *= x_dims[lead];
    ++lead;
  }

  // Middle part: the dimensions shared with y; they must match exactly
  // ("Broadcast dimension mismatch." otherwise).
  for (int k = 0; k < y_dims.size(); ++k) {
    assert(x_dims[k + axis] == y_dims[k]);
    (*n) *= y_dims[k];
  }

  // Trailing part: whatever x dimensions remain after the shared span.
  for (int tail = axis + y_dims.size(); tail < x_dims.size(); ++tail) {
    (*post) *= x_dims[tail];
  }
}
/// Removes trailing dimensions of extent 1 in place,
/// e.g. (4,20,1,1) -> (4,20). Leaves `*dims` untouched when the last
/// dimension is not 1.
inline void trim_trailing_singular_dims(framework::DDim *dims) {
  // Walk backwards until a dimension different from 1 (or the front) is hit.
  auto kept = dims->size();
  while (kept != 0 && (*dims)[kept - 1] == 1) {
    --kept;
  }

  if (kept == dims->size()) {
    return;  // nothing to trim
  }

  auto shape = framework::vectorize(*dims);
  shape.resize(kept);
  *dims = framework::make_ddim(shape);
}
// Fallback so this class compiles on its own; a no-op when the file-level
// UNLIKELY definition is already in effect.
#ifndef UNLIKELY
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#endif

/// Read-only cursor that cycles over the `n` elements starting at `ptr`:
/// after the last element, the next increment returns to the first one.
template <typename T>
class RowwiseTransformIterator {
 public:
  /// @param ptr  first element of the row to cycle over
  /// @param n    number of elements in the row
  RowwiseTransformIterator(const T *ptr, int n) : ptr_(ptr), i_(0), n_(n) {}

  /// Moves to the next element, wrapping to index 0 once index `n` is reached.
  RowwiseTransformIterator<T> &operator++() {
    ++i_;
    if (UNLIKELY(i_ == n_)) {
      i_ = 0;
    }
    return *this;
  }

  /// Two iterators are equal when they currently address the same element.
  bool operator==(const RowwiseTransformIterator<T> &rhs) const {
    return (ptr_ + i_) == &(*rhs);
  }

  bool operator!=(const RowwiseTransformIterator<T> &rhs) const {
    return (ptr_ + i_) != &(*rhs);
  }

  // const-qualified: the comparison operators dereference a const reference,
  // which does not compile against a non-const operator*.
  const T &operator*() const { return ptr_[i_]; }

 private:
  const T *ptr_;
  int64_t i_;  // same width as n_, so the wrap check compares like types
  int64_t n_;
};
// Fallback so this class compiles on its own; a no-op when the file-level
// UNLIKELY definition is already in effect.
#ifndef UNLIKELY
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
#endif

/// Read-only cursor over `n` elements in which every element is yielded
/// `post` times in a row before moving on, wrapping back to the first
/// element after the last one.
///
/// Example: for (4,20,2) + (20,), the (20,) operand is treated as (20,1);
/// each of its elements is reused for the 2 consecutive entries of the last
/// dimension of the (4,20,2) operand.
template <typename T>
class MidWiseTransformIterator {
 public:
  /// @param ptr   first element of the sequence to cycle over
  /// @param n     number of elements in the sequence
  /// @param post  how many consecutive increments stay on the same element
  MidWiseTransformIterator(const T *ptr, int n, int post)
      : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}

  MidWiseTransformIterator<T> &operator++() {
    if (post_ != 1) {
      ++j_;
      if (UNLIKELY(j_ == post_)) {
        j_ = 0;
        NextElement();
      }
    } else {
      // post == 1: every increment moves to the next element, so the
      // repeat counter does not need to be touched.
      NextElement();
    }
    return *this;
  }

  /// Two iterators are equal when they currently address the same element;
  /// the repeat counter is not part of the comparison.
  bool operator==(const MidWiseTransformIterator<T> &rhs) const {
    return (ptr_ + i_) == &(*rhs);
  }

  bool operator!=(const MidWiseTransformIterator<T> &rhs) const {
    return (ptr_ + i_) != &(*rhs);
  }

  // const-qualified: the comparison operators dereference a const reference,
  // which does not compile against a non-const operator*.
  const T &operator*() const { return ptr_[i_]; }

 private:
  /// Steps to the next element, wrapping to index 0 after the last one.
  void NextElement() {
    ++i_;
    if (UNLIKELY(i_ == n_)) {
      i_ = 0;
    }
  }

  const T *ptr_;
  int64_t i_;
  int64_t j_;
  int64_t n_;
  int64_t post_;
};
nx_(x->numel()), template <typename Functor, typename T, typename OutType = T>
func_(func) {} class TransformFunctor {
public:
inline void Run() const { TransformFunctor(const framework::Tensor *x, const framework::Tensor *y,
math::Transform trans; framework::Tensor *z, Functor func)
// 同时执行func(x_, y_)传入z_。 : x_(x->data<T>()),
trans(x_, x_ + nx_, y_, z_, func_); y_(y->data<T>()),
} z_(z->mutable_data<OutType>()),
nx_(x->numel()),
inline void RunRowWise(int n, int pre) const { func_(func) {}
math::Transform trans;
trans(x_, x_ + nx_, RowwiseTransformIterator<T>(y_, n), z_, func_); inline void Run() const {
} math::Transform trans;
// 同时执行func(x_, y_)传入z_。
inline void RunMidWise(int n, int pre, int post) const { trans(x_, x_ + nx_, y_, z_, func_);
math::Transform trans; }
trans(x_, x_ + nx_, MidWiseTransformIterator<T>(y_, n, post), z_, func_);
} inline void RunMidWise(int n, int pre, int post) const {
math::Transform trans;
private: trans(x_, x_ + nx_, MidWiseTransformIterator<T>(y_, n, post), z_, func_);
const T *x_; }
const T *y_;
OutType *z_; private:
int64_t nx_; const T *x_;
Functor func_; const T *y_;
}; OutType *z_;
int64_t nx_;
template <typename Functor, typename T, typename OutType = T> Functor func_;
void ElementwiseComputeEx(const framework::Tensor *x, };
const framework::Tensor *y, int axis, Functor func,
framework::Tensor *z) { template <typename Functor, typename T, typename OutType = T>
TransformFunctor<Functor, T, OutType> functor(x, y, z, func); void ElementwiseComputeEx(const framework::Tensor *x,
const framework::Tensor *y, int axis, Functor func,
auto x_dims = x->dims(); framework::Tensor *z) {
auto y_dims = y->dims(); TransformFunctor<Functor, T, OutType> functor(x, y, z, func);
// PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
// "Rank of first input must >= rank of second auto x_dims = x->dims();
// input."); auto y_dims = y->dims();
PADDLE_MOBILE_ENFORCE(x_dims.size() >= y_dims.size(),
if (x_dims == y_dims) { "Rank of first input must >= rank of second input.");
functor.Run();
return; if (x_dims == y_dims) {
} functor.Run();
return;
/// axis = -1 represent the last dimension. }
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
// PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), /// axis = -1 represent the last dimensions.
// "Axis should be in range [0, x_dims)"); axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
trim_trailing_singular_dims(&y_dims); PADDLE_MOBILE_ENFORCE(axis >= 0 && axis < x_dims.size(),
axis = (y_dims.size() == 0) ? x_dims.size() : axis; "Axis should be in range [0, x_dims)");
trim_trailing_singular_dims(&y_dims);
int pre, n, post; axis = (y_dims.size() == 0) ? x_dims.size() : axis;
get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
if (post == 1) { int pre, n, post;
functor.RunRowWise(n, pre); get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post);
return;
} else { functor.RunMidWise(n, pre, post);
functor.RunMidWise(n, pre, post); }
return;
} } // namespace operators
}
} // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册