activation_op.h 16.0 KB
Newer Older
Q
qijun 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */

#pragma once
#include <utility>
#include <vector>

#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"

namespace paddle {
namespace operators {

22 23 24
// Forward kernel shared by the activation operators.
//
// Compute() fetches input "X" and output "Y" from the execution context,
// calls mutable_data<T>() on "Y" for the context's place, flattens both
// tensors into Eigen vectors, fills the functor's float attributes from the
// context, and finally invokes the functor.
template <typename Place, typename Functor>
class ActivationKernel
    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
 public:
  using T = typename Functor::ELEMENT_TYPE;

  void Compute(const framework::ExecutionContext& context) const override {
    auto* input = context.Input<framework::Tensor>("X");
    auto* output = context.Output<framework::Tensor>("Y");
    output->mutable_data<T>(context.GetPlace());

    auto flat_in = framework::EigenVector<T>::Flatten(*input);
    auto flat_out = framework::EigenVector<T>::Flatten(*output);
    auto device = context.GetEigenDevice<Place>();

    Functor activation;
    // Each entry is (attribute name, pointer to the functor member that
    // receives the attribute value).
    for (auto& name_and_slot : activation.GetAttrs()) {
      *name_and_slot.second = context.Attr<float>(name_and_slot.first);
    }
    activation(device, flat_in, flat_out);
  }
};

46 47 48
// Backward kernel shared by the activation operators.
//
// Compute() fetches "X", "Y" and the gradient of "Y" as inputs and the
// gradient of "X" as output, calls mutable_data<T>() on that output for the
// context's place, flattens all four tensors, fills the functor's float
// attributes from the context, and invokes the functor as
// functor(device, x, y, dy, dx).
template <typename Place, typename Functor>
class ActivationGradKernel
    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
 public:
  using T = typename Functor::ELEMENT_TYPE;

  void Compute(const framework::ExecutionContext& context) const override {
    auto* input = context.Input<framework::Tensor>("X");
    auto* output = context.Input<framework::Tensor>("Y");
    auto* output_grad =
        context.Input<framework::Tensor>(framework::GradVarName("Y"));
    auto* input_grad =
        context.Output<framework::Tensor>(framework::GradVarName("X"));
    input_grad->mutable_data<T>(context.GetPlace());

    auto flat_dy = framework::EigenVector<T>::Flatten(*output_grad);
    auto flat_x = framework::EigenVector<T>::Flatten(*input);
    auto flat_y = framework::EigenVector<T>::Flatten(*output);
    auto flat_dx = framework::EigenVector<T>::Flatten(*input_grad);
    auto device = context.GetEigenDevice<Place>();

    Functor activation_grad;
    // Each entry is (attribute name, pointer to the functor member that
    // receives the attribute value).
    for (auto& name_and_slot : activation_grad.GetAttrs()) {
      *name_and_slot.second = context.Attr<float>(name_and_slot.first);
    }
    activation_grad(device, flat_x, flat_y, flat_dy, flat_dx);
  }
};

72 73 74 75 76 77 78 79 80
// Common base of the activation functors.
//
// ELEMENT_TYPE publishes the element type T so the kernels can recover it
// from the functor type. AttrPair is a list of (attribute name, pointer to
// float destination) entries. The base GetAttrs() reports no attributes;
// functors that carry float attributes hide it with their own non-virtual
// GetAttrs().
template <typename T>
struct BaseActivationFunctor {
  using ELEMENT_TYPE = T;

  using AttrPair = std::vector<std::pair<const char*, float*>>;

  AttrPair GetAttrs() {
    AttrPair no_attrs;
    return no_attrs;
  }
};

81
// sigmoid(x) = 1 / (1 + exp(-x))
Q
qijun 已提交
82
template <typename T>
83
struct SigmoidFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
84
  template <typename Device, typename X, typename Y>
85
  void operator()(Device d, X x, Y y) const {
86
    y.device(d) = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
Q
qijun 已提交
87 88 89
  }
};

90
template <typename T>
91
struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
92
  template <typename Device, typename X, typename Y, typename dY, typename dX>
93
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
94
    dx.device(d) = dy * y * (static_cast<T>(1) - y);
Q
qijun 已提交
95 96 97
  }
};

Q
qijun 已提交
98
// exp(x) = e^x
99 100
template <typename T>
struct ExpFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
101
  template <typename Device, typename X, typename Y>
102
  void operator()(Device d, X x, Y y) const {
Q
qijun 已提交
103 104 105 106
    y.device(d) = x.exp();
  }
};

107 108
// Backward exponential: dx = dy * y, reusing the forward output y = exp(x).
// The forward input is accepted but not read.
template <typename T>
struct ExpGradFunctor : public BaseActivationFunctor<T> {
  template <typename Dev, typename In, typename Out, typename OutGrad,
            typename InGrad>
  void operator()(Dev dev, In in, Out out, OutGrad out_grad,
                  InGrad in_grad) const {
    in_grad.device(dev) = out_grad * out;
  }
};

Q
qijun 已提交
115
// relu(x) = max(x, 0)
Q
qijun 已提交
116
template <typename T>
117
struct ReluFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
118
  template <typename Device, typename X, typename Y>
119
  void operator()(Device d, X x, Y y) const {
Q
qijun 已提交
120 121 122
    y.device(d) = x.cwiseMax(static_cast<T>(0));
  }
};
Q
qijun 已提交
123

Q
qijun 已提交
124
template <typename T>
125
struct ReluGradFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
126
  template <typename Device, typename X, typename Y, typename dY, typename dX>
127
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
Q
qijun 已提交
128 129 130
    dx.device(d) = dy * (x > static_cast<T>(0)).template cast<T>();
  }
};
Q
qijun 已提交
131

132
// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
133 134
template <typename T>
struct TanhFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
135
  template <typename Device, typename X, typename Y>
136
  void operator()(Device d, X x, Y y) const {
Q
qijun 已提交
137 138 139 140 141
    y.device(d) = x.tanh();
  }
};

template <typename T>
142
struct TanhGradFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
143
  template <typename Device, typename X, typename Y, typename dY, typename dX>
144
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
145
    dx.device(d) = dy * (static_cast<T>(1) - y * y);
Q
qijun 已提交
146 147 148
  }
};

K
Kavya Srinet 已提交
149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166
// Forward tanh-shrink: y = x - tanh(x),
// with tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)).
template <typename T>
struct TanhShrinkFunctor : public BaseActivationFunctor<T> {
  // Assigns in - tanh(in) to `out`, evaluated on device `dev`.
  template <typename Dev, typename In, typename Out>
  void operator()(Dev dev, In in, Out out) const {
    out.device(dev) = in - in.tanh();
  }
};

// Backward tanh-shrink: dx = dy * tanh(x)^2, since
// d/dx (x - tanh(x)) = 1 - (1 - tanh(x)^2). The forward output is accepted
// but not read.
template <typename T>
struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
  template <typename Dev, typename In, typename Out, typename OutGrad,
            typename InGrad>
  void operator()(Dev dev, In in, Out out, OutGrad out_grad,
                  InGrad in_grad) const {
    in_grad.device(dev) = out_grad * (in.tanh() * in.tanh());
  }
};

Q
qijun 已提交
167
// sqrt(x) = x^(1/2)
168 169
template <typename T>
struct SqrtFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
170
  template <typename Device, typename X, typename Y>
171
  void operator()(Device d, X x, Y y) const {
Q
qijun 已提交
172 173 174 175 176
    y.device(d) = x.sqrt();
  }
};

template <typename T>
177
struct SqrtGradFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
178
  template <typename Device, typename X, typename Y, typename dY, typename dX>
179
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
180
    const Y y_conj = Eigen::numext::conj(y);
Q
qijun 已提交
181 182 183 184
    dx.device(d) = static_cast<T>(0.5) * dy / y_conj;
  }
};

Q
qijun 已提交
185
// abs(x) = |x|
186 187
template <typename T>
struct AbsFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
188
  template <typename Device, typename X, typename Y>
189
  void operator()(Device d, X x, Y y) const {
Q
qijun 已提交
190 191 192 193
    y.device(d) = x.abs();
  }
};

194 195
// Backward absolute value: dx = dy * sign(x). The forward output is
// accepted but not read.
template <typename T>
struct AbsGradFunctor : public BaseActivationFunctor<T> {
  template <typename Dev, typename In, typename Out, typename OutGrad,
            typename InGrad>
  void operator()(Dev dev, In in, Out out, OutGrad out_grad,
                  InGrad in_grad) const {
    in_grad.device(dev) = out_grad * in.sign();
  }
};

Q
qijun 已提交
202 203
// reciprocal(x) = 1 / x
template <typename T>
204
struct ReciprocalFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
205
  template <typename Device, typename X, typename Y>
206
  void operator()(Device d, X x, Y y) const {
207
    y.device(d) = static_cast<T>(1) / x;
Q
qijun 已提交
208 209 210
  }
};

211
template <typename T>
212
struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
213
  template <typename Device, typename X, typename Y, typename dY, typename dX>
214
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
215
    dx.device(d) = dy * static_cast<T>(-1) * y * y;
Q
qijun 已提交
216 217 218 219
  }
};

// log(x) = natural logarithm of x
220 221
template <typename T>
struct LogFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
222
  template <typename Device, typename X, typename Y>
223
  void operator()(Device d, X x, Y y) const {
Q
qijun 已提交
224 225 226 227
    y.device(d) = x.log();
  }
};

228
template <typename T>
229
struct LogGradFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
230
  template <typename Device, typename X, typename Y, typename dY, typename dX>
231
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
232
    dx.device(d) = dy * (static_cast<T>(1) / x);
Q
qijun 已提交
233 234 235 236
  }
};

// square(x) = x^2
237 238
template <typename T>
struct SquareFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
239
  template <typename Device, typename X, typename Y>
240
  void operator()(Device d, X x, Y y) const {
Q
qijun 已提交
241 242
    y.device(d) = x.square();
  }
243
};
Q
qijun 已提交
244

245
template <typename T>
246
struct SquareGradFunctor : public BaseActivationFunctor<T> {
Q
qijun 已提交
247
  template <typename Device, typename X, typename Y, typename dY, typename dX>
248
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
249 250 251 252
    dx.device(d) = dy * static_cast<T>(2) * x;
  }
};

253 254 255 256 257 258 259 260 261 262
// Forward bounded ReLU: y = min(max(x, t_min), t_max).
template <typename T>
struct BReluFunctor : public BaseActivationFunctor<T> {
  float t_min;
  float t_max;

  // Deliberately hides BaseActivationFunctor<T>::GetAttrs instead of
  // overriding a virtual, so no dynamic dispatch is involved.
  // Returns the attribute names paired with the members they fill.
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    typename BaseActivationFunctor<T>::AttrPair attrs;
    attrs.push_back({"t_min", &t_min});
    attrs.push_back({"t_max", &t_max});
    return attrs;
  }

  // Assigns `in` clamped to [t_min, t_max] to `out` on device `dev`.
  template <typename Dev, typename In, typename Out>
  void operator()(Dev dev, In in, Out out) const {
    out.device(dev) = in.cwiseMax(t_min).cwiseMin(t_max);
  }
};

270 271 272 273 274 275 276 277 278 279
// Backward bounded ReLU: dx = dy where t_min < x < t_max, and 0 elsewhere
// (both bounds excluded). The forward output is accepted but not read.
template <typename T>
struct BReluGradFunctor : public BaseActivationFunctor<T> {
  float t_min;
  float t_max;

  // Hides the base GetAttrs; returns the attribute names paired with the
  // members they fill.
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    typename BaseActivationFunctor<T>::AttrPair attrs;
    attrs.push_back({"t_min", &t_min});
    attrs.push_back({"t_max", &t_max});
    return attrs;
  }

  template <typename Dev, typename In, typename Out, typename OutGrad,
            typename InGrad>
  void operator()(Dev dev, In in, Out out, OutGrad out_grad,
                  InGrad in_grad) const {
    // Product of the two comparisons, cast to T, acts as a 0/1 mask.
    in_grad.device(dev) =
        out_grad * ((in > t_min) * (in < t_max)).template cast<T>();
  }
};

283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312
// Forward ReLU6: y = min(max(0, x), threshold).
// The upper bound comes from the "threshold" attribute (6 for classic relu6).
template <typename T>
struct Relu6Functor : public BaseActivationFunctor<T> {
  float threshold;

  // Deliberately hides BaseActivationFunctor<T>::GetAttrs instead of
  // overriding a virtual, so no dynamic dispatch is involved.
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    typename BaseActivationFunctor<T>::AttrPair attrs;
    attrs.push_back({"threshold", &threshold});
    return attrs;
  }

  // Assigns `in` clamped to [0, threshold] to `out` on device `dev`.
  template <typename Dev, typename In, typename Out>
  void operator()(Dev dev, In in, Out out) const {
    const T zero = static_cast<T>(0);
    out.device(dev) = in.cwiseMax(zero).cwiseMin(threshold);
  }
};

// Backward ReLU6: dx = dy where 0 < x < threshold, and 0 elsewhere (both
// bounds excluded). The forward output is accepted but not read.
template <typename T>
struct Relu6GradFunctor : public BaseActivationFunctor<T> {
  float threshold;

  // Hides the base GetAttrs; returns the attribute name paired with the
  // member it fills.
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    typename BaseActivationFunctor<T>::AttrPair attrs;
    attrs.push_back({"threshold", &threshold});
    return attrs;
  }

  template <typename Dev, typename In, typename Out, typename OutGrad,
            typename InGrad>
  void operator()(Dev dev, In in, Out out, OutGrad out_grad,
                  InGrad in_grad) const {
    const T zero = static_cast<T>(0);
    // Product of the two comparisons, cast to T, acts as a 0/1 mask.
    in_grad.device(dev) =
        out_grad * ((in > zero) * (in < threshold)).template cast<T>();
  }
};

313 314
// softsign(x) = x / (1 + |x|)
template <typename T>
315
struct SoftsignFunctor : public BaseActivationFunctor<T> {
316 317 318 319 320 321 322 323 324
  template <typename Device, typename X, typename Y>
  void operator()(Device d, X x, Y y) {
    y.device(d) = x / (static_cast<T>(1) + x.abs());
  }
};

// d(softsign(x))/dx = 1 / (1 + |x|)^2
// Taken from https://en.wikipedia.org/wiki/Activation_function
template <typename T>
325
struct SoftsignGradFunctor : public BaseActivationFunctor<T> {
326 327 328 329 330 331 332
  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) {
    dx.device(d) =
        dy * (static_cast<T>(1) / (static_cast<T>(1) + x.abs()).square());
  }
};

333 334 335 336 337 338
// Forward soft ReLU: y = log(1 + exp(clip(x, -threshold, threshold))).
template <typename T>
struct SoftReluFunctor : public BaseActivationFunctor<T> {
  float threshold;

  // Hides the base GetAttrs; returns the attribute name paired with the
  // member it fills.
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    typename BaseActivationFunctor<T>::AttrPair attrs;
    attrs.push_back({"threshold", &threshold});
    return attrs;
  }

  template <typename Dev, typename In, typename Out>
  void operator()(Dev dev, In in, Out out) const {
    const T one = static_cast<T>(1);
    // Input is clamped to [-threshold, threshold] before exponentiation.
    auto clipped = in.cwiseMax(-threshold).cwiseMin(threshold);
    out.device(dev) = (one + clipped.exp()).log();
  }
};

347 348 349 350 351 352 353 354
// Backward soft ReLU: dx = dy * (1 - exp(-y)) where -threshold < x <
// threshold, and 0 elsewhere (both bounds excluded).
template <typename T>
struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
  float threshold;

  // Hides the base GetAttrs; returns the attribute name paired with the
  // member it fills.
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    typename BaseActivationFunctor<T>::AttrPair attrs;
    attrs.push_back({"threshold", &threshold});
    return attrs;
  }

  template <typename Dev, typename In, typename Out, typename OutGrad,
            typename InGrad>
  void operator()(Dev dev, In in, Out out, OutGrad out_grad,
                  InGrad in_grad) const {
    const T one = static_cast<T>(1);
    // 0/1 mask selecting the inputs that were not clipped in the forward.
    auto inside_mask =
        ((in > -threshold) * (in < threshold)).template cast<T>().eval();
    in_grad.device(dev) = out_grad * (one - (-out).exp()) * inside_mask;
  }
};

K
Kavya Srinet 已提交
360 361 362 363 364 365
// Forward leaky ReLU: y = max(x, alpha * x).
template <typename T>
struct LeakyReluFunctor : public BaseActivationFunctor<T> {
  float alpha;

  // Hides the base GetAttrs; returns the attribute name paired with the
  // member it fills.
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    typename BaseActivationFunctor<T>::AttrPair attrs;
    attrs.push_back({"alpha", &alpha});
    return attrs;
  }

  // Assigns max(in, alpha * in) to `out`, evaluated on device `dev`.
  template <typename Dev, typename In, typename Out>
  void operator()(Dev dev, In in, Out out) const {
    out.device(dev) = in.cwiseMax(alpha * in);
  }
};

K
Kavya Srinet 已提交
373 374 375 376 377 378 379 380 381 382 383
// Backward leaky ReLU: dx = dy * alpha where x < 0, and dx = dy where
// x >= 0. The forward output is accepted but not read.
template <typename T>
struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
  float alpha;

  // Hides the base GetAttrs; returns the attribute name paired with the
  // member it fills.
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    typename BaseActivationFunctor<T>::AttrPair attrs;
    attrs.push_back({"alpha", &alpha});
    return attrs;
  }

  template <typename Dev, typename In, typename Out, typename OutGrad,
            typename InGrad>
  void operator()(Dev dev, In in, Out out, OutGrad out_grad,
                  InGrad in_grad) const {
    const T zero = static_cast<T>(0);
    // Slope contribution of the negative half: alpha where in < 0.
    auto negative_slope = alpha * (in < zero).template cast<T>().eval();
    // Slope contribution of the non-negative half: 1 where in >= 0.
    auto positive_slope = (in >= zero).template cast<T>().eval();
    in_grad.device(dev) =
        out_grad * (negative_slope + positive_slope).template cast<T>();
  }
};

387 388 389 390 391 392
// Forward ELU: y = max(0, x) + min(0, alpha * (exp(x) - 1)).
template <typename T>
struct ELUFunctor : public BaseActivationFunctor<T> {
  float alpha;

  // Hides the base GetAttrs; returns the attribute name paired with the
  // member it fills.
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    typename BaseActivationFunctor<T>::AttrPair attrs;
    attrs.push_back({"alpha", &alpha});
    return attrs;
  }

  template <typename Dev, typename In, typename Out>
  void operator()(Dev dev, In in, Out out) const {
    const T zero = static_cast<T>(0);
    const T one = static_cast<T>(1);
    // Sum of the positive part of the input and the non-positive part of
    // the scaled exponential term.
    out.device(dev) =
        in.cwiseMax(zero) + (alpha * (in.exp() - one)).cwiseMin(zero);
  }
};

402 403 404 405 406 407 408 409 410
// Backward ELU:
//   dx = dy                for x >= 0
//   dx = dy * (y + alpha)  for x <  0
// On the negative branch the forward output is alpha * (exp(x) - 1) when
// alpha >= 0, so y + alpha equals alpha * exp(x).
template <typename T>
struct ELUGradFunctor : public BaseActivationFunctor<T> {
  float alpha;

  // Hides the base GetAttrs; returns the attribute name paired with the
  // member it fills.
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"alpha", &alpha}};
  }

  template <typename Device, typename X, typename Y, typename dY, typename dX>
  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
    // The two masks must partition the input. With strict comparisons on
    // both sides, x == 0 matched neither branch and its gradient was zero;
    // the boundary is assigned to the identity branch, the same convention
    // LeakyReluGradFunctor uses (x >= 0).
    dx.device(d) =
        dy * (x >= static_cast<T>(0)).template cast<T>() +
        dy * (y + alpha) * (x < static_cast<T>(0)).template cast<T>();
  }
};

416 417 418 419 420 421 422 423 424
// Forward power: y = x^factor.
template <typename T>
struct PowFunctor : public BaseActivationFunctor<T> {
  float factor;

  // Hides the base GetAttrs; returns the attribute name paired with the
  // member it fills.
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    typename BaseActivationFunctor<T>::AttrPair attrs;
    attrs.push_back({"factor", &factor});
    return attrs;
  }

  // Assigns in^factor to `out`, evaluated on device `dev`.
  template <typename Dev, typename In, typename Out>
  void operator()(Dev dev, In in, Out out) const {
    out.device(dev) = in.pow(factor);
  }
};

428 429 430 431 432 433 434 435 436
// Backward power: dx = dy * factor * x^(factor - 1). The forward output is
// accepted but not read.
template <typename T>
struct PowGradFunctor : public BaseActivationFunctor<T> {
  float factor;

  // Hides the base GetAttrs; returns the attribute name paired with the
  // member it fills.
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    typename BaseActivationFunctor<T>::AttrPair attrs;
    attrs.push_back({"factor", &factor});
    return attrs;
  }

  template <typename Dev, typename In, typename Out, typename OutGrad,
            typename InGrad>
  void operator()(Dev dev, In in, Out out, OutGrad out_grad,
                  InGrad in_grad) const {
    in_grad.device(dev) =
        out_grad * factor * in.pow(factor - static_cast<T>(1));
  }
};

440 441 442 443 444 445 446
// Forward scaled tanh: y = scale_b * tanh(scale_a * x).
template <typename T>
struct STanhFunctor : public BaseActivationFunctor<T> {
  float scale_a;
  float scale_b;

  // Hides the base GetAttrs; returns the attribute names paired with the
  // members they fill.
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    typename BaseActivationFunctor<T>::AttrPair attrs;
    attrs.push_back({"scale_a", &scale_a});
    attrs.push_back({"scale_b", &scale_b});
    return attrs;
  }

  // Assigns scale_b * tanh(scale_a * in) to `out` on device `dev`.
  template <typename Dev, typename In, typename Out>
  void operator()(Dev dev, In in, Out out) const {
    out.device(dev) = scale_b * (scale_a * in).tanh();
  }
};

454 455 456 457 458 459 460
// Backward scaled tanh:
//   dx = dy * scale_a * scale_b * (1 - tanh(scale_a * x)^2).
// The forward output is accepted but not read.
template <typename T>
struct STanhGradFunctor : public BaseActivationFunctor<T> {
  float scale_a;
  float scale_b;

  // Hides the base GetAttrs; returns the attribute names paired with the
  // members they fill.
  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    typename BaseActivationFunctor<T>::AttrPair attrs;
    attrs.push_back({"scale_a", &scale_a});
    attrs.push_back({"scale_b", &scale_b});
    return attrs;
  }

  template <typename Dev, typename In, typename Out, typename OutGrad,
            typename InGrad>
  void operator()(Dev dev, In in, Out out, OutGrad out_grad,
                  InGrad in_grad) const {
    const T one = static_cast<T>(1);
    // tanh(scale_a * x) squared, written as a product of the two factors.
    auto tanh_squared = (scale_a * in).tanh() * (scale_a * in).tanh();
    in_grad.device(dev) = out_grad * scale_a * scale_b * (one - tanh_squared);
  }
};

Q
qijun 已提交
469 470
}  // namespace operators
}  // namespace paddle
471

472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487
// Lists every activation as (op_name, ForwardFunctor, BackwardFunctor).
//
// Invoke with the name of a three-argument macro; it is expanded once per
// activation, in the order listed, with the expansions separated by ';'.
// The last expansion carries no trailing ';' so the caller supplies it.
//
// The parameter is named KERNEL_MACRO rather than `__macro` because
// identifiers containing a double underscore are reserved for the
// implementation.
#define FOR_EACH_KERNEL_FUNCTOR(KERNEL_MACRO)                          \
  KERNEL_MACRO(sigmoid, SigmoidFunctor, SigmoidGradFunctor);           \
  KERNEL_MACRO(exp, ExpFunctor, ExpGradFunctor);                       \
  KERNEL_MACRO(relu, ReluFunctor, ReluGradFunctor);                    \
  KERNEL_MACRO(tanh, TanhFunctor, TanhGradFunctor);                    \
  KERNEL_MACRO(sqrt, SqrtFunctor, SqrtGradFunctor);                    \
  KERNEL_MACRO(abs, AbsFunctor, AbsGradFunctor);                       \
  KERNEL_MACRO(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);  \
  KERNEL_MACRO(log, LogFunctor, LogGradFunctor);                       \
  KERNEL_MACRO(square, SquareFunctor, SquareGradFunctor);              \
  KERNEL_MACRO(brelu, BReluFunctor, BReluGradFunctor);                 \
  KERNEL_MACRO(soft_relu, SoftReluFunctor, SoftReluGradFunctor);       \
  KERNEL_MACRO(pow, PowFunctor, PowGradFunctor);                       \
  KERNEL_MACRO(stanh, STanhFunctor, STanhGradFunctor);                 \
  KERNEL_MACRO(softsign, SoftsignFunctor, SoftsignGradFunctor);        \
  KERNEL_MACRO(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor);    \
  KERNEL_MACRO(relu6, Relu6Functor, Relu6GradFunctor);                 \
  KERNEL_MACRO(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \
  KERNEL_MACRO(elu, ELUFunctor, ELUGradFunctor)