activation_op.kps 22.7 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
L
Luo Tao 已提交
2 3 4 5 6 7 8 9 10
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
L
liaogang 已提交
11

Y
Yi Wang 已提交
12
#include "paddle/fluid/operators/activation_op.h"
13 14
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
15
#include "paddle/fluid/platform/bfloat16.h"
16
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
17 18
#include "paddle/phi/kernels/funcs/activation_functor.h"

19 20 21
namespace paddle {
namespace operators {

22 23 24 25 26 27 28 29 30 31 32 33
template <typename T>
struct CudaSoftReluFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  MPType one = static_cast<MPType>(1.0f);
  float threshold;

  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"threshold", &threshold}};
  }

  // soft_relu(x) = log(1 + exp(max(min(x, threshold), -threshold)))
  // threshold should not be negative
34
  __device__ __forceinline__ T operator()(const T arg_x) const {
35
    MPType x = static_cast<MPType>(arg_x);
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
    MPType t = static_cast<MPType>(threshold);
    MPType temp_min = x < t ? x : t;
    MPType temp_max = temp_min > -t ? temp_min : -t;
    return static_cast<T>(log(one + exp(temp_max)));
  }
};

template <typename T>
struct CudaSoftReluGradFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  MPType one = static_cast<MPType>(1.0f);
  float threshold;

  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"threshold", &threshold}};
  }

  // dx = (out > -threshold && out < threshold) ? dout * (1 - exp(-out)) : 0
  // threshold should not be negative
55 56
  __device__ __forceinline__ T operator()(const T arg_dout,
                                          const T arg_out) const {
57 58
    MPType dout = static_cast<MPType>(arg_dout);
    MPType out = static_cast<MPType>(arg_out);
59 60 61 62 63
    MPType t = static_cast<MPType>(threshold);
    return (out > -t && out < t) ? static_cast<T>(dout * (one - exp(-out)))
                                 : static_cast<T>(0.0f);
  }

64 65 66
  static constexpr ActBwdOpFwdDeps FwdDeps() {
    return ActBwdOpFwdDeps::kDepOut;
  }
67 68
};

69
template <typename DeviceContext, typename Functor>
70
class ActivationCudaKernel
71 72 73
    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
 public:
  using T = typename Functor::ELEMENT_TYPE;
74 75
  void Compute(const framework::ExecutionContext& ctx) const override {
    const framework::Tensor* x = nullptr;
76
    framework::Tensor* out = nullptr;
77 78 79 80 81 82
    ExtractActivationTensor(ctx, &x, &out);
    out->mutable_data<T>(ctx.GetPlace());
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    std::vector<const framework::Tensor*> ins = {x};
    std::vector<framework::Tensor*> outs = {out};
    auto functor = Functor();
83 84
    auto attrs = functor.GetAttrs();
    for (auto& attr : attrs) {
85
      *attr.second = ctx.Attr<float>(attr.first);
86
    }
87 88
    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
        dev_ctx, ins, &outs, functor);
89 90 91 92
  }
};

template <typename DeviceContext, typename Functor>
93
class ActivationGradCudaKernel
94 95 96
    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
 public:
  using T = typename Functor::ELEMENT_TYPE;
97
  void Compute(const framework::ExecutionContext& ctx) const override {
98 99 100
    const framework::Tensor *x, *out, *d_out;
    framework::Tensor* d_x = nullptr;
    x = out = d_out = nullptr;
101 102
    ExtractActivationGradTensor<Functor::FwdDeps()>(
        ctx, &x, &out, &d_out, &d_x);
103 104 105 106 107 108 109 110 111 112
    d_x->mutable_data<T>(ctx.GetPlace());
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto functor = Functor();
    auto attrs = functor.GetAttrs();
    for (auto& attr : attrs) {
      *attr.second = ctx.Attr<float>(attr.first);
    }

    std::vector<const framework::Tensor*> ins = {d_out};
    std::vector<framework::Tensor*> outs = {d_x};
113

114 115
    if (static_cast<int>(Functor::FwdDeps()) ==
        static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
116
      // Only need forward output Out
117
      ins.push_back(out);
118 119
      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
          dev_ctx, ins, &outs, functor);
120
    } else if (static_cast<int>(Functor::FwdDeps()) ==
121
               static_cast<int>(ActBwdOpFwdDeps::kDepX)) {
122
      // Only need forward input X
123
      ins.push_back(x);
124 125
      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
          dev_ctx, ins, &outs, functor);
126
    } else {
127 128
      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
          dev_ctx, ins, &outs, functor);
129 130 131 132
    }
  }
};

133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
USE_PHI_FUNCTOR(CudaCos)
USE_PHI_FUNCTOR(CudaTan)
USE_PHI_FUNCTOR(CudaAcos)
USE_PHI_FUNCTOR(CudaSin)
USE_PHI_FUNCTOR(CudaAsin)
USE_PHI_FUNCTOR(CudaAtan)
USE_PHI_FUNCTOR(CudaSinh)
USE_PHI_FUNCTOR(CudaCosh)
USE_PHI_FUNCTOR(CudaAsinh)
USE_PHI_FUNCTOR(CudaAcosh)
USE_PHI_FUNCTOR(CudaAtanh)
USE_PHI_FUNCTOR(CudaTanh)
USE_PHI_FUNCTOR(CudaBRelu)
USE_PHI_FUNCTOR(CudaLeakyRelu)
USE_PHI_FUNCTOR(CudaThresholdedRelu)
148
USE_PHI_FUNCTOR(CudaRelu6)
Y
YuanRisheng 已提交
149 150 151 152 153
USE_PHI_FUNCTOR(CudaHardShrink)
USE_PHI_FUNCTOR(CudaSoftShrink)
USE_PHI_FUNCTOR(CudaTanhShrink)
USE_PHI_FUNCTOR(CudaSilu)
USE_PHI_FUNCTOR(CudaELU)
154
USE_PHI_FUNCTOR(CudaSoftsign)
Y
YuanRisheng 已提交
155 156 157
USE_PHI_FUNCTOR(CudaSigmoid)
USE_PHI_FUNCTOR(CudaLogSigmoid)
USE_PHI_FUNCTOR(CudaHardSigmoid)
158 159 160 161
USE_PHI_FUNCTOR(CudaLog)
USE_PHI_FUNCTOR(CudaLog2)
USE_PHI_FUNCTOR(CudaLog10)
USE_PHI_FUNCTOR(CudaLog1p)
Y
YuanRisheng 已提交
162 163 164 165 166 167 168 169 170 171 172 173 174 175
USE_PHI_FUNCTOR(CudaSwish)
USE_PHI_FUNCTOR(CudaHardSwish)

template <typename T>
using CudaRoundFunctor = phi::funcs::CudaRoundFunctor<T>;

template <typename T>
using CudaFloorFunctor = phi::funcs::CudaFloorFunctor<T>;

template <typename T>
using CudaCeilFunctor = phi::funcs::CudaCeilFunctor<T>;

template <typename T>
using CudaZeroGradFunctor = phi::funcs::CudaZeroGradFunctor<T>;
Y
YuanRisheng 已提交
176

177 178 179 180 181 182 183 184 185 186
USE_PHI_FUNCTOR(CudaExp)
USE_PHI_FUNCTOR(CudaExpm1)
USE_PHI_FUNCTOR(CudaMish)
USE_PHI_FUNCTOR(CudaSTanh)
USE_PHI_FUNCTOR(CudaReciprocal)
USE_PHI_FUNCTOR(CudaSquare)
USE_PHI_FUNCTOR(CudaSqrt)
USE_PHI_FUNCTOR(CudaRsqrt)
USE_PHI_FUNCTOR(CudaSoftplus)

Y
YuanRisheng 已提交
187 188 189
template <typename T>
using CudaELUGradNegativeAlphaFunctor =
    phi::funcs::CudaELUGradNegativeAlphaFunctor<T>;
190

191 192 193
}  // namespace operators
}  // namespace paddle

194
namespace ops = paddle::operators;
195 196
namespace plat = paddle::platform;

L
Leo Chen 已提交
197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214
#define REGISTER_ACTIVATION_CUDA_KERNEL(                                       \
    act_type, op_name, functor, grad_functor)                                  \
  REGISTER_OP_CUDA_KERNEL(                                                     \
      act_type,                                                                \
      ops::ActivationCudaKernel<phi::GPUContext, ops::functor<float>>,         \
      ops::ActivationCudaKernel<phi::GPUContext, ops::functor<double>>,        \
      ops::ActivationCudaKernel<phi::GPUContext, ops::functor<plat::float16>>, \
      ops::ActivationCudaKernel<phi::GPUContext,                               \
                                ops::functor<plat::bfloat16>>);                \
  REGISTER_OP_CUDA_KERNEL(                                                     \
      act_type##_grad,                                                         \
      ops::ActivationGradCudaKernel<phi::GPUContext,                           \
                                    ops::grad_functor<float>>,                 \
      ops::ActivationGradCudaKernel<phi::GPUContext,                           \
                                    ops::grad_functor<double>>,                \
      ops::ActivationGradCudaKernel<phi::GPUContext,                           \
                                    ops::grad_functor<plat::float16>>,         \
      ops::ActivationGradCudaKernel<phi::GPUContext,                           \
215
                                    ops::grad_functor<plat::bfloat16>>);
216

L
Leo Chen 已提交
217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
#define REGISTER_ACTIVATION_CUDA_KERNEL_INT(                                   \
    act_type, op_name, functor, grad_functor)                                  \
  REGISTER_OP_CUDA_KERNEL(                                                     \
      act_type,                                                                \
      ops::ActivationCudaKernel<phi::GPUContext, ops::functor<float>>,         \
      ops::ActivationCudaKernel<phi::GPUContext, ops::functor<double>>,        \
      ops::ActivationCudaKernel<phi::GPUContext, ops::functor<int>>,           \
      ops::ActivationCudaKernel<phi::GPUContext, ops::functor<int64_t>>,       \
      ops::ActivationCudaKernel<phi::GPUContext, ops::functor<plat::float16>>, \
      ops::ActivationCudaKernel<phi::GPUContext,                               \
                                ops::functor<plat::bfloat16>>);                \
  REGISTER_OP_CUDA_KERNEL(                                                     \
      act_type##_grad,                                                         \
      ops::ActivationGradCudaKernel<phi::GPUContext,                           \
                                    ops::grad_functor<float>>,                 \
      ops::ActivationGradCudaKernel<phi::GPUContext,                           \
                                    ops::grad_functor<double>>,                \
      ops::ActivationGradCudaKernel<phi::GPUContext, ops::grad_functor<int>>,  \
      ops::ActivationGradCudaKernel<phi::GPUContext,                           \
                                    ops::grad_functor<int64_t>>,               \
      ops::ActivationGradCudaKernel<phi::GPUContext,                           \
                                    ops::grad_functor<plat::float16>>,         \
      ops::ActivationGradCudaKernel<phi::GPUContext,                           \
240
                                    ops::grad_functor<plat::bfloat16>>);
241

242
REGISTER_OP_CUDA_KERNEL(
243
    relu6,
L
Leo Chen 已提交
244 245 246 247 248
    ops::ActivationCudaKernel<phi::GPUContext, ops::CudaRelu6Functor<float>>,
    ops::ActivationCudaKernel<phi::GPUContext, ops::CudaRelu6Functor<double>>,
    ops::ActivationCudaKernel<phi::GPUContext, ops::CudaRelu6Functor<int>>,
    ops::ActivationCudaKernel<phi::GPUContext, ops::CudaRelu6Functor<int64_t>>,
    ops::ActivationCudaKernel<phi::GPUContext,
Y
YuanRisheng 已提交
249
                              ops::CudaRelu6Functor<plat::float16>>,
L
Leo Chen 已提交
250
    ops::ActivationCudaKernel<phi::GPUContext,
Y
YuanRisheng 已提交
251
                              ops::CudaRelu6Functor<plat::bfloat16>>);
W
whs 已提交
252
REGISTER_OP_CUDA_KERNEL(
253
    relu6_grad,
L
Leo Chen 已提交
254
    ops::ActivationGradCudaKernel<phi::GPUContext,
255
                                  ops::CudaRelu6GradFunctor<float>>,
L
Leo Chen 已提交
256
    ops::ActivationGradCudaKernel<phi::GPUContext,
Y
YuanRisheng 已提交
257
                                  ops::CudaRelu6GradFunctor<double>>,
L
Leo Chen 已提交
258
    ops::ActivationGradCudaKernel<phi::GPUContext,
Y
YuanRisheng 已提交
259
                                  ops::CudaRelu6GradFunctor<int>>,
L
Leo Chen 已提交
260
    ops::ActivationGradCudaKernel<phi::GPUContext,
Y
YuanRisheng 已提交
261
                                  ops::CudaRelu6GradFunctor<int64_t>>,
L
Leo Chen 已提交
262
    ops::ActivationGradCudaKernel<phi::GPUContext,
Y
YuanRisheng 已提交
263
                                  ops::CudaRelu6GradFunctor<plat::float16>>,
L
Leo Chen 已提交
264
    ops::ActivationGradCudaKernel<phi::GPUContext,
Y
YuanRisheng 已提交
265
                                  ops::CudaRelu6GradFunctor<plat::bfloat16>>);
R
ronnywang 已提交
266

267 268
#define FOR_EACH_ACTIVATION_CUDA_OP(__macro)                                  \
  __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \
269 270
  __macro(softsign, Softsign, CudaSoftsignFunctor, CudaSoftsignGradFunctor);

271
FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL)
272 273

#ifdef PADDLE_WITH_XPU_KP
274
REGISTER_OP_KERNEL(
275 276 277
    brelu,
    KP,
    plat::XPUPlace,
278 279 280
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              phi::funcs::CudaBReluFunctor<float>>);
REGISTER_OP_KERNEL(
281 282 283
    brelu_grad,
    KP,
    plat::XPUPlace,
284 285 286
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  phi::funcs::CudaBReluGradFunctor<float>>);

287 288 289
REGISTER_OP_KERNEL(ceil,
                   KP,
                   plat::XPUPlace,
290 291 292
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaCeilFunctor<float>>);
REGISTER_OP_KERNEL(
293 294 295
    ceil_grad,
    KP,
    plat::XPUPlace,
296 297 298
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaZeroGradFunctor<float>>);

Y
YuanRisheng 已提交
299
REGISTER_OP_KERNEL(
300 301 302
    celu,
    KP,
    plat::XPUPlace,
Y
YuanRisheng 已提交
303 304
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              phi::funcs::CudaCELUFunctor<float>>);
305
REGISTER_OP_KERNEL(
306 307 308
    celu_grad,
    KP,
    plat::XPUPlace,
309
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
Y
YuanRisheng 已提交
310
                                  phi::funcs::CudaCELUGradFunctor<float>>);
311

312 313 314
REGISTER_OP_KERNEL(elu,
                   KP,
                   plat::XPUPlace,
315 316 317
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaELUFunctor<float>>);
REGISTER_OP_KERNEL(
318 319 320
    elu_grad,
    KP,
    plat::XPUPlace,
321 322 323
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaELUGradFunctor<float>>);

324 325 326
REGISTER_OP_KERNEL(exp,
                   KP,
                   plat::XPUPlace,
327 328 329
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaExpFunctor<float>>);
REGISTER_OP_KERNEL(
330 331 332
    exp_grad,
    KP,
    plat::XPUPlace,
333 334 335
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaExpGradFunctor<float>>);

336 337 338
REGISTER_OP_KERNEL(floor,
                   KP,
                   plat::XPUPlace,
339 340 341
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaFloorFunctor<float>>);
REGISTER_OP_KERNEL(
342 343 344
    floor_grad,
    KP,
    plat::XPUPlace,
345 346 347 348
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaZeroGradFunctor<float>>);

REGISTER_OP_KERNEL(
349 350 351
    hard_shrink,
    KP,
    plat::XPUPlace,
352 353 354
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              ops::CudaHardShrinkFunctor<float>>);
REGISTER_OP_KERNEL(
355 356 357
    hard_shrink_grad,
    KP,
    plat::XPUPlace,
358 359 360 361
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaHardShrinkGradFunctor<float>>);

REGISTER_OP_KERNEL(
362 363 364
    hard_sigmoid,
    KP,
    plat::XPUPlace,
365 366 367
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              ops::CudaHardSigmoidFunctor<float>>);
REGISTER_OP_KERNEL(
368 369 370
    hard_sigmoid_grad,
    KP,
    plat::XPUPlace,
371 372 373
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaHardSigmoidGradFunctor<float>>);

374 375 376
REGISTER_OP_KERNEL(hard_swish,
                   KP,
                   plat::XPUPlace,
377 378 379
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaHardSwishFunctor<float>>);
REGISTER_OP_KERNEL(
380 381 382
    hard_swish_grad,
    KP,
    plat::XPUPlace,
383 384 385 386
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaHardSwishGradFunctor<float>>);

REGISTER_OP_KERNEL(
387 388 389
    leaky_relu,
    KP,
    plat::XPUPlace,
390 391 392
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              phi::funcs::CudaLeakyReluFunctor<float>>);
REGISTER_OP_KERNEL(
393 394 395
    leaky_relu_grad,
    KP,
    plat::XPUPlace,
396 397 398
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  phi::funcs::CudaLeakyReluGradFunctor<float>>);

399 400 401
REGISTER_OP_KERNEL(log,
                   KP,
                   plat::XPUPlace,
402 403 404
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaLogFunctor<float>>);
REGISTER_OP_KERNEL(
405 406 407
    log_grad,
    KP,
    plat::XPUPlace,
408 409 410
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaLogGradFunctor<float>>);

411 412 413
REGISTER_OP_KERNEL(log1p,
                   KP,
                   plat::XPUPlace,
414 415 416
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaLog1pFunctor<float>>);
REGISTER_OP_KERNEL(
417 418 419
    log1p_grad,
    KP,
    plat::XPUPlace,
420 421 422 423
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaLog1pGradFunctor<float>>);

REGISTER_OP_KERNEL(
424 425 426
    logsigmoid,
    KP,
    plat::XPUPlace,
427 428 429
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              ops::CudaLogSigmoidFunctor<float>>);
REGISTER_OP_KERNEL(
430 431 432
    logsigmoid_grad,
    KP,
    plat::XPUPlace,
433 434 435 436
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaLogSigmoidGradFunctor<float>>);

REGISTER_OP_KERNEL(
437 438 439
    reciprocal,
    KP,
    plat::XPUPlace,
440 441 442
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              ops::CudaReciprocalFunctor<float>>);
REGISTER_OP_KERNEL(
443 444 445
    reciprocal_grad,
    KP,
    plat::XPUPlace,
446 447 448 449
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaReciprocalGradFunctor<float>>);

REGISTER_OP_KERNEL(
450 451 452
    relu,
    KP,
    plat::XPUPlace,
453 454 455
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              phi::funcs::CudaReluFunctor<float>>);
REGISTER_OP_KERNEL(
456 457 458
    relu_grad,
    KP,
    plat::XPUPlace,
459 460 461
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  phi::funcs::CudaReluGradFunctor<float>>);

462 463 464
REGISTER_OP_KERNEL(relu6,
                   KP,
                   plat::XPUPlace,
465 466 467
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaRelu6Functor<float>>);
REGISTER_OP_KERNEL(
468 469 470
    relu6_grad,
    KP,
    plat::XPUPlace,
471 472 473
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaRelu6GradFunctor<float>>);

474 475 476
REGISTER_OP_KERNEL(sigmoid,
                   KP,
                   plat::XPUPlace,
477 478 479
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaSigmoidFunctor<float>>);
REGISTER_OP_KERNEL(
480 481 482
    sigmoid_grad,
    KP,
    plat::XPUPlace,
483 484 485
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaSigmoidGradFunctor<float>>);

486 487 488
REGISTER_OP_KERNEL(silu,
                   KP,
                   plat::XPUPlace,
489 490 491
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaSiluFunctor<float>>);
REGISTER_OP_KERNEL(
492 493 494
    silu_grad,
    KP,
    plat::XPUPlace,
495 496 497
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaSiluGradFunctor<float>>);

498 499 500
REGISTER_OP_KERNEL(soft_relu,
                   KP,
                   plat::XPUPlace,
501 502 503
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaSoftReluFunctor<float>>);
REGISTER_OP_KERNEL(
504 505 506
    soft_relu_grad,
    KP,
    plat::XPUPlace,
507 508 509
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaSoftReluGradFunctor<float>>);

510 511 512
REGISTER_OP_KERNEL(softplus,
                   KP,
                   plat::XPUPlace,
513 514 515
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaSoftplusFunctor<float>>);
REGISTER_OP_KERNEL(
516 517 518
    softplus_grad,
    KP,
    plat::XPUPlace,
519 520 521 522
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaSoftplusGradFunctor<float>>);

REGISTER_OP_KERNEL(
523 524 525
    softshrink,
    KP,
    plat::XPUPlace,
526 527 528
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              ops::CudaSoftShrinkFunctor<float>>);
REGISTER_OP_KERNEL(
529 530 531
    softshrink_grad,
    KP,
    plat::XPUPlace,
532 533 534
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaSoftShrinkGradFunctor<float>>);

535 536 537
REGISTER_OP_KERNEL(softsign,
                   KP,
                   plat::XPUPlace,
538 539 540
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaSoftsignFunctor<float>>);
REGISTER_OP_KERNEL(
541 542 543
    softsign_grad,
    KP,
    plat::XPUPlace,
544 545 546
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaSoftsignGradFunctor<float>>);

547 548 549
REGISTER_OP_KERNEL(sqrt,
                   KP,
                   plat::XPUPlace,
550 551 552
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaSqrtFunctor<float>>);
REGISTER_OP_KERNEL(
553 554 555
    sqrt_grad,
    KP,
    plat::XPUPlace,
556 557 558
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaSqrtGradFunctor<float>>);

559 560 561
REGISTER_OP_KERNEL(square,
                   KP,
                   plat::XPUPlace,
562 563 564
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaSquareFunctor<float>>);
REGISTER_OP_KERNEL(
565 566 567
    square_grad,
    KP,
    plat::XPUPlace,
568 569 570
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaSquareGradFunctor<float>>);

571 572 573
REGISTER_OP_KERNEL(swish,
                   KP,
                   plat::XPUPlace,
574 575 576
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaSwishFunctor<float>>);
REGISTER_OP_KERNEL(
577 578 579
    swish_grad,
    KP,
    plat::XPUPlace,
580 581 582 583
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaSwishGradFunctor<float>>);

REGISTER_OP_KERNEL(
584 585 586
    thresholded_relu,
    KP,
    plat::XPUPlace,
587 588 589
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              ops::CudaThresholdedReluFunctor<float>>);
REGISTER_OP_KERNEL(
590 591 592
    thresholded_relu_grad,
    KP,
    plat::XPUPlace,
593 594
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaThresholdedReluGradFunctor<float>>);
595 596

#endif  // PADDLE_WITH_XPU_KP