activation_op.h 15.0 KB
Newer Older
Q
qijun 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */

#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"

namespace paddle {
namespace operators {

22 23 24
template <typename Place, typename Functor>
class ActivationKernel
    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
Q
qijun 已提交
25
 public:
26 27
  using T = typename Functor::ELEMENT_TYPE;

Q
qijun 已提交
28 29 30 31 32 33 34 35 36
  void Compute(const framework::ExecutionContext& context) const override {
    auto* X = context.Input<framework::Tensor>("X");
    auto* Y = context.Output<framework::Tensor>("Y");
    Y->mutable_data<T>(context.GetPlace());

    auto x = framework::EigenVector<T>::Flatten(*X);
    auto y = framework::EigenVector<T>::Flatten(*Y);
    auto place = context.GetEigenDevice<Place>();
    Functor functor;
37 38 39 40 41

    auto attrs = functor.GetAttrs();
    for (auto& attr : attrs) {
      *attr.second = context.Attr<float>(attr.first);
    }
Q
qijun 已提交
42 43 44 45
    functor(place, x, y);
  }
};

46 47 48
template <typename Place, typename Functor>
class ActivationGradKernel
    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
Q
qijun 已提交
49
 public:
50
  using T = typename Functor::ELEMENT_TYPE;
Q
qijun 已提交
51 52 53 54 55 56 57 58 59 60 61 62 63
  void Compute(const framework::ExecutionContext& context) const override {
    auto* X = context.Input<framework::Tensor>("X");
    auto* Y = context.Input<framework::Tensor>("Y");
    auto* dY = context.Input<framework::Tensor>(framework::GradVarName("Y"));
    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
    dX->mutable_data<T>(context.GetPlace());

    auto dy = framework::EigenVector<T>::Flatten(*dY);
    auto x = framework::EigenVector<T>::Flatten(*X);
    auto y = framework::EigenVector<T>::Flatten(*Y);
    auto dx = framework::EigenVector<T>::Flatten(*dX);
    auto place = context.GetEigenDevice<Place>();
    Functor functor;
64 65 66 67
    auto attrs = functor.GetAttrs();
    for (auto& attr : attrs) {
      *attr.second = context.Attr<float>(attr.first);
    }
Q
qijun 已提交
68 69 70 71
    functor(place, x, y, dy, dx);
  }
};

72 73 74 75 76 77 78 79 80
template <typename T>
struct BaseActivationFunctor {
  using ELEMENT_TYPE = T;

  using AttrPair = std::vector<std::pair<const char*, float*>>;

  AttrPair GetAttrs() { return AttrPair(); }
};

81
// sigmoid(x) = 1 / (1 + exp(-x))
Q
qijun 已提交
82
template <typename T>
83
struct SigmoidFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
84
  template <typename Device, typename X, typename Y>
85
  void operator()(Device d, X x, Y y) const {
86
    y.device(d) = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
Q
qijun 已提交
87 88 89
  }
};

90
template <typename T>
91
struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
92
  template <typename Device, typename X, typename Y, typename dY, typename dX>
93
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
94
    dx.device(d) = dy * y * (static_cast<T>(1) - y);
Q
qijun 已提交
95 96 97
  }
};

Q
qijun 已提交
98
// exp(x) = e^x
99 100
template <typename T>
struct ExpFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
101
  template <typename Device, typename X, typename Y>
102
  void operator()(Device d, X x, Y y) const {
Q
qijun 已提交
103 104 105 106
    y.device(d) = x.exp();
  }
};

107 108
template <typename T>
struct ExpGradFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
109
  template <typename Device, typename X, typename Y, typename dY, typename dX>
110
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
Q
qijun 已提交
111
    dx.device(d) = dy * y;
Q
qijun 已提交
112 113 114
  }
};

Q
qijun 已提交
115
// relu(x) = max(x, 0)
Q
qijun 已提交
116
template <typename T>
117
struct ReluFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
118
  template <typename Device, typename X, typename Y>
119
  void operator()(Device d, X x, Y y) const {
Q
qijun 已提交
120 121 122
    y.device(d) = x.cwiseMax(static_cast<T>(0));
  }
};
Q
qijun 已提交
123

Q
qijun 已提交
124
template <typename T>
125
struct ReluGradFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
126
  template <typename Device, typename X, typename Y, typename dY, typename dX>
127
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
Q
qijun 已提交
128 129 130
    dx.device(d) = dy * (x > static_cast<T>(0)).template cast<T>();
  }
};
Q
qijun 已提交
131

132
// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
133 134
template <typename T>
struct TanhFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
135
  template <typename Device, typename X, typename Y>
136
  void operator()(Device d, X x, Y y) const {
Q
qijun 已提交
137 138 139 140 141
    y.device(d) = x.tanh();
  }
};

template <typename T>
142
struct TanhGradFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
143
  template <typename Device, typename X, typename Y, typename dY, typename dX>
144
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
145
    dx.device(d) = dy * (static_cast<T>(1) - y * y);
Q
qijun 已提交
146 147 148
  }
};

K
Kavya Srinet 已提交
149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166
// tanhshrink(x) = x - tanh(x)
// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct TanhShrinkFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = x - x.tanh();
  }
};

template <typename T>
struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * (x.tanh() * x.tanh());
  }
};

Q
qijun 已提交
167
// sqrt(x) = x^(1/2)
168 169
template <typename T>
struct SqrtFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
170
  template <typename Device, typename X, typename Y>
171
  void operator()(Device d, X x, Y y) const {
Q
qijun 已提交
172 173 174 175 176
    y.device(d) = x.sqrt();
  }
};

template <typename T>
177
struct SqrtGradFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
178
  template <typename Device, typename X, typename Y, typename dY, typename dX>
179
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
180
    const Y y_conj = Eigen::numext::conj(y);
Q
qijun 已提交
181 182 183 184
    dx.device(d) = static_cast<T>(0.5) * dy / y_conj;
  }
};

Q
qijun 已提交
185
// abs(x) = |x|
186 187
template <typename T>
struct AbsFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
188
  template <typename Device, typename X, typename Y>
189
  void operator()(Device d, X x, Y y) const {
Q
qijun 已提交
190 191 192 193
    y.device(d) = x.abs();
  }
};

194 195
template <typename T>
struct AbsGradFunctor : public BaseActivationFunctor<T> {
196
  template <typename Device, typename X, typename Y, typename dY, typename dX>
197
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
198 199 200 201
    dx.device(d) = dy * x.sign();
  }
};

Q
qijun 已提交
202 203
// reciprocal(x) = 1 / x
template <typename T>
204
struct ReciprocalFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
205
  template <typename Device, typename X, typename Y>
206
  void operator()(Device d, X x, Y y) const {
207
    y.device(d) = static_cast<T>(1) / x;
Q
qijun 已提交
208 209 210
  }
};

211
template <typename T>
212
struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
213
  template <typename Device, typename X, typename Y, typename dY, typename dX>
214
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
215
    dx.device(d) = dy * static_cast<T>(-1) * y * y;
Q
qijun 已提交
216 217 218 219
  }
};

// log(x) = natural logarithm of x
220 221
template <typename T>
struct LogFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
222
  template <typename Device, typename X, typename Y>
223
  void operator()(Device d, X x, Y y) const {
Q
qijun 已提交
224 225 226 227
    y.device(d) = x.log();
  }
};

228
template <typename T>
229
struct LogGradFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
230
  template <typename Device, typename X, typename Y, typename dY, typename dX>
231
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
232
    dx.device(d) = dy * (static_cast<T>(1) / x);
Q
qijun 已提交
233 234 235 236
  }
};

// square(x) = x^2
237 238
template <typename T>
struct SquareFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
239
  template <typename Device, typename X, typename Y>
240
  void operator()(Device d, X x, Y y) const {
Q
qijun 已提交
241 242
    y.device(d) = x.square();
  }
243
};
Q
qijun 已提交
244

245
template <typename T>
246
struct SquareGradFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
247
  template <typename Device, typename X, typename Y, typename dY, typename dX>
248
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
249 250 251 252
    dx.device(d) = dy * static_cast<T>(2) * x;
  }
};

253 254 255 256 257 258 259 260 261 262
template <typename T>
struct BReluFunctor : public BaseActivationFunctor<T> {
  float t_min;
  float t_max;

  // NOTE: Explicit hides the `BaseActivationFunctor<T>::GetAttrs`
  // not polymorphism for speed.
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"t_min", &t_min}, {"t_max", &t_max}};
  }
263

264 265 266
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.cwiseMax(t_min).cwiseMin(t_max);
267 268 269
  }
};

270 271 272 273 274 275 276 277 278 279
template <typename T>
struct BReluGradFunctor : public BaseActivationFunctor<T> {
  float t_min;
  float t_max;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"t_min", &t_min}, {"t_max", &t_max}};
  }
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * ((x > t_min) * (x < t_max)).template cast<T>();
280 281 282
  }
};

283 284
// softsign(x) = x / (1 + |x|)
template <typename T>
285
struct SoftsignFunctor : public BaseActivationFunctor<T> {
286 287 288 289 290 291 292 293 294
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) {
    y.device(d) = x / (static_cast<T>(1) + x.abs());
  }
};

// d(softsign(x))/dx = 1 / (1 + |x|)^2
// Taken from https://en.wikipedia.org/wiki/Activation_function
template <typename T>
295
struct SoftsignGradFunctor : public BaseActivationFunctor<T> {
296 297 298 299 300 301 302
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) {
    dx.device(d) =
        dy * (static_cast<T>(1) / (static_cast<T>(1) + x.abs()).square());
  }
};

303 304 305 306 307 308
template <typename T>
struct SoftReluFunctor : public BaseActivationFunctor<T> {
  float threshold;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"threshold", &threshold}};
  }
309

310 311 312 313
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    auto temp = x.cwiseMax(-threshold).cwiseMin(threshold);
    y.device(d) = (static_cast<T>(1) + temp.exp()).log();
314 315 316
  }
};

317 318 319 320 321 322 323 324
template <typename T>
struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
  float threshold;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"threshold", &threshold}};
  }
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
325
    auto temp = ((x > -threshold) * (x < threshold)).template cast<T>().eval();
326
    dx.device(d) = dy * (static_cast<T>(1) - (-y).exp()) * temp;
327 328 329
  }
};

K
Kavya Srinet 已提交
330 331 332 333 334 335
template <typename T>
struct LeakyReluFunctor : public BaseActivationFunctor<T> {
  float alpha;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"alpha", &alpha}};
  }
336

K
Kavya Srinet 已提交
337 338 339
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.cwiseMax(alpha * x);
340 341 342
  }
};

K
Kavya Srinet 已提交
343 344 345 346 347 348 349 350 351 352 353
template <typename T>
struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
  float alpha;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"alpha", &alpha}};
  }
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    auto temp1 = alpha * (x < static_cast<T>(0)).template cast<T>().eval();
    auto temp2 = (x >= static_cast<T>(0)).template cast<T>().eval();
    dx.device(d) = dy * (temp1 + temp2).template cast<T>();
354 355 356
  }
};

357 358 359 360 361 362
template <typename T>
struct ELUFunctor : public BaseActivationFunctor<T> {
  float alpha;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"alpha", &alpha}};
  }
363

364 365 366
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) =
367 368 369 370 371
        x.cwiseMax(static_cast<T>(0)) +
        (alpha * (x.exp() - static_cast<T>(1))).cwiseMin(static_cast<T>(0));
  }
};

372 373 374 375 376 377 378 379 380
template <typename T>
struct ELUGradFunctor : public BaseActivationFunctor<T> {
  float alpha;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"alpha", &alpha}};
  }
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) =
381 382 383 384 385
        dy * (x > static_cast<T>(0)).template cast<T>() +
        dy * (y + alpha) * (x < static_cast<T>(0)).template cast<T>();
  }
};

386 387 388 389 390 391 392 393 394
template <typename T>
struct PowFunctor : public BaseActivationFunctor<T> {
  float factor;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"factor", &factor}};
  }
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = x.pow(factor);
395 396 397
  }
};

398 399 400 401 402 403 404 405 406
template <typename T>
struct PowGradFunctor : public BaseActivationFunctor<T> {
  float factor;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"factor", &factor}};
  }
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    dx.device(d) = dy * factor * x.pow(factor - static_cast<T>(1));
407 408 409
  }
};

410 411 412 413 414 415 416
template <typename T>
struct STanhFunctor : public BaseActivationFunctor<T> {
  float scale_a;
  float scale_b;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
  }
417

418 419 420
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) const {
    y.device(d) = scale_b * (scale_a * x).tanh();
421 422 423
  }
};

424 425 426 427 428 429 430
template <typename T>
struct STanhGradFunctor : public BaseActivationFunctor<T> {
  float scale_a;
  float scale_b;
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
  }
431

432 433
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
434
    auto temp = (scale_a * x).tanh() * (scale_a * x).tanh();
435
    dx.device(d) = dy * scale_a * scale_b * (static_cast<T>(1) - temp);
Q
qijun 已提交
436 437 438
  }
};

Q
qijun 已提交
439 440
}  // namespace operators
}  // namespace paddle
441

442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459
#define FOR_EACH_KERNEL_FUNCTOR(__macro)                          \
  __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor);           \
  __macro(exp, ExpFunctor, ExpGradFunctor);                       \
  __macro(relu, ReluFunctor, ReluGradFunctor);                    \
  __macro(tanh, TanhFunctor, TanhGradFunctor);                    \
  __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                    \
  __macro(abs, AbsFunctor, AbsGradFunctor);                       \
  __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);  \
  __macro(log, LogFunctor, LogGradFunctor);                       \
  __macro(square, SquareFunctor, SquareGradFunctor);              \
  __macro(brelu, BReluFunctor, BReluGradFunctor);                 \
  __macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor);       \
  __macro(pow, PowFunctor, PowGradFunctor);                       \
  __macro(stanh, STanhFunctor, STanhGradFunctor);                 \
  __macro(softsign, SoftsignFunctor, SoftsignGradFunctor);        \
  __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor);    \
  __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \
  __macro(elu, ELUFunctor, ELUGradFunctor)