/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "operators/kernel/activation_kernel.h"
#include "common/types.h"
#include "operators/math/activation.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif

namespace paddle_mobile {
namespace operators {

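// Fallback for element types without a specialization: both overloads are intentionally no-ops.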
template <typename Dtype, ActivationType Act>
struct ActivationCompute {
  void operator()(const Tensor *input, Tensor *output) {}
  void operator()(const Tensor *input, Tensor *output, float alpha) {}
};

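// Float specialization: process 16 floats per iteration with NEON intrinsics, then
// fall back to the scalar math::Active<Act> for the tail.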
template <ActivationType Act>
struct ActivationCompute<float, Act> {
  void operator()(const Tensor *input, Tensor *output) {
    const float *x = input->data<float>();
    float *y = output->mutable_data<float>();
    size_t remain = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
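    // 'loop' counts full 16-float blocks; 'remain' becomes the tail length.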
    size_t loop = remain >> 4;
    remain = remain & 0xF;

#pragma omp parallel for
    for (size_t i = 0; i < loop; ++i) {
      const float *local_x = x + (i << 4);
      float *local_y = y + (i << 4);
      float32x4_t r0 = vld1q_f32(local_x);
      float32x4_t r1 = vld1q_f32(local_x + 4);
      float32x4_t r2 = vld1q_f32(local_x + 8);
      float32x4_t r3 = vld1q_f32(local_x + 12);
      r0 = math::vActiveq_f32<Act>(r0);
      r1 = math::vActiveq_f32<Act>(r1);
      r2 = math::vActiveq_f32<Act>(r2);
      r3 = math::vActiveq_f32<Act>(r3);
      vst1q_f32(local_y, r0);
      vst1q_f32(local_y + 4, r1);
      vst1q_f32(local_y + 8, r2);
      vst1q_f32(local_y + 12, r3);
    }
    x += (loop << 4);
    y += (loop << 4);
#endif
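    // Scalar tail (covers the whole range when NEON is unavailable).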
    for (size_t i = 0; i < remain; ++i) {
      y[i] = math::Active<Act>(x[i]);
    }
  }

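  // Overload for activations that take an extra parameter (e.g. the alpha of leaky_relu).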
  void operator()(const Tensor *input, Tensor *output, float falpha) {
    const float *x = input->data<float>();
    float *y = output->mutable_data<float>();
    size_t remain = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
    size_t loop = remain >> 4;
    remain = remain & 0xF;

#pragma omp parallel for
    for (size_t i = 0; i < loop; ++i) {
      const float *local_x = x + (i << 4);
      float *local_y = y + (i << 4);
      float32x4_t r0 = vld1q_f32(local_x);
      float32x4_t r1 = vld1q_f32(local_x + 4);
      float32x4_t r2 = vld1q_f32(local_x + 8);
      float32x4_t r3 = vld1q_f32(local_x + 12);
      // Broadcast alpha across all four lanes once instead of reloading it from memory.
      float32x4_t alpha_v = vdupq_n_f32(falpha);
      r0 = math::vActiveq_f32<Act>(r0, alpha_v);
      r1 = math::vActiveq_f32<Act>(r1, alpha_v);
      r2 = math::vActiveq_f32<Act>(r2, alpha_v);
      r3 = math::vActiveq_f32<Act>(r3, alpha_v);
      vst1q_f32(local_y, r0);
      vst1q_f32(local_y + 4, r1);
      vst1q_f32(local_y + 8, r2);
      vst1q_f32(local_y + 12, r3);
    }
    x += (loop << 4);
    y += (loop << 4);
#endif
    for (size_t i = 0; i < remain; ++i) {
      y[i] = math::Active<Act>(x[i], falpha);
    }
  }
};

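// Each kernel below is a thin wrapper: Init() has nothing to set up, and Compute()
// runs ActivationCompute with the matching ActivationType, then propagates the
// input's LoD to the output.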
#ifdef RELU_OP
template <>
bool ReluKernel<CPU, float>::Init(ReluParam<CPU> *param) {
  return true;
}

template <>
void ReluKernel<CPU, float>::Compute(const ReluParam<CPU> &param) {
  const LoDTensor *input = param.InputX();
  LoDTensor *output = param.Out();
  ActivationCompute<float, RELU>()(input, output);
  output->set_lod(input->lod());
}

template <>
bool Relu6Kernel<CPU, float>::Init(ReluParam<CPU> *param) {
  return true;
}

template <>
void Relu6Kernel<CPU, float>::Compute(const ReluParam<CPU> &param) {
  const LoDTensor *input = param.InputX();
  LoDTensor *output = param.Out();
  ActivationCompute<float, RELU6>()(input, output);
  output->set_lod(input->lod());
}
#endif

#ifdef SIGMOID_OP
template <>
bool SigmoidKernel<CPU, float>::Init(SigmoidParam<CPU> *param) {
  return true;
}

template <>
void SigmoidKernel<CPU, float>::Compute(const SigmoidParam<CPU> &param) {
  const LoDTensor *input = param.InputX();
  LoDTensor *output = param.Out();
  ActivationCompute<float, SIGMOID>()(input, output);
  output->set_lod(input->lod());
}
#endif

#ifdef TANH_OP
template <>
bool TanhKernel<CPU, float>::Init(TanhParam<CPU> *param) {
  return true;
}

template <>
void TanhKernel<CPU, float>::Compute(const TanhParam<CPU> &param) {
  const LoDTensor *input = param.InputX();
  LoDTensor *output = param.Out();
  ActivationCompute<float, TANH>()(input, output);
  output->set_lod(input->lod());
}
#endif

#ifdef LOG_OP
template <>
bool LogKernel<CPU, float>::Init(ReluParam<CPU> *param) {
  return true;
}

template <>
void LogKernel<CPU, float>::Compute(const ReluParam<CPU> &param) {
  const LoDTensor *input = param.InputX();
  LoDTensor *output = param.Out();
  ActivationCompute<float, LOG>()(input, output);
  output->set_lod(input->lod());
}
#endif

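// LeakyRelu additionally forwards the alpha attribute to the parameterized overload.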
#ifdef LEAKY_RELU_OP
template <>
bool LeakyReluKernel<CPU, float>::Init(LeakyReluParam<CPU> *param) {
  return true;
}

template <>
void LeakyReluKernel<CPU, float>::Compute(const LeakyReluParam<CPU> &param) {
  const LoDTensor *input = param.InputX();
  LoDTensor *output = param.Out();
  ActivationCompute<float, LEAKY_RELU>()(input, output, param.Alpha());
  output->set_lod(input->lod());
}
#endif

}  // namespace operators
}  // namespace paddle_mobile