/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/phi/kernels/funcs/activation_functor.h"

namespace paddle {
namespace operators {

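// The two SoftRelu functors below compute in MPType (float when T is
// float16/bfloat16, see MPTypeTrait) so that exp/log are evaluated at full
// precision; only the final result is cast back to T.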
template <typename T>
struct CudaSoftReluFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  MPType one = static_cast<MPType>(1.0f);
  float threshold;

  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"threshold", &threshold}};
  }

  // soft_relu(x) = log(1 + exp(max(min(x, threshold), -threshold)))
  // threshold should not be negative
  __device__ __forceinline__ T operator()(const T arg_x) const {
    MPType x = static_cast<MPType>(arg_x);
    MPType t = static_cast<MPType>(threshold);
    MPType temp_min = x < t ? x : t;
    MPType temp_max = temp_min > -t ? temp_min : -t;
    return static_cast<T>(log(one + exp(temp_max)));
  }
};

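// Backward functor for soft_relu: the gradient depends only on the forward
// output Out, which is why FwdDeps() below returns ActBwdOpFwdDeps::kDepOut
// (the framework then feeds Out rather than X to this functor).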
template <typename T>
struct CudaSoftReluGradFunctor : public BaseActivationFunctor<T> {
  using MPType = typename details::MPTypeTrait<T>::Type;
  MPType one = static_cast<MPType>(1.0f);
  float threshold;

  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
    return {{"threshold", &threshold}};
  }

  // dx = (out > -threshold && out < threshold) ? dout * (1 - exp(-out)) : 0
  // threshold should not be negative
  __device__ __forceinline__ T operator()(const T arg_dout,
                                          const T arg_out) const {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType out = static_cast<MPType>(arg_out);
    MPType t = static_cast<MPType>(threshold);
    return (out > -t && out < t) ? static_cast<T>(dout * (one - exp(-out)))
                                 : static_cast<T>(0.0f);
  }

  static constexpr ActBwdOpFwdDeps FwdDeps() {
    return ActBwdOpFwdDeps::kDepOut;
  }
};

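// Generic forward kernel: extracts X and Out, copies the functor's float
// attributes (e.g. "threshold") from the op attributes, and launches a
// same-dims elementwise kernel with the functor.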
template <typename DeviceContext, typename Functor>
class ActivationCudaKernel
    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
 public:
  using T = typename Functor::ELEMENT_TYPE;
  void Compute(const framework::ExecutionContext& ctx) const override {
    const framework::Tensor* x = nullptr;
    framework::Tensor* out = nullptr;
    ExtractActivationTensor(ctx, &x, &out);
    out->mutable_data<T>(ctx.GetPlace());
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    std::vector<const framework::Tensor*> ins = {x};
    std::vector<framework::Tensor*> outs = {out};
    auto functor = Functor();
    auto attrs = functor.GetAttrs();
    for (auto& attr : attrs) {
      *attr.second = ctx.Attr<float>(attr.first);
    }
    paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
        dev_ctx, ins, &outs, functor);
  }
};

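// Generic backward kernel: dOut is always an input; depending on the
// functor's FwdDeps(), the forward output Out (kDepOut) or the forward input
// X (kDepX) is appended, and functors that need neither get dOut only.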
template <typename DeviceContext, typename Functor>
class ActivationGradCudaKernel
    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
 public:
  using T = typename Functor::ELEMENT_TYPE;
  void Compute(const framework::ExecutionContext& ctx) const override {
    const framework::Tensor *x, *out, *d_out;
    framework::Tensor* d_x = nullptr;
    x = out = d_out = nullptr;
    ExtractActivationGradTensor<Functor::FwdDeps()>(
        ctx, &x, &out, &d_out, &d_x);
    d_x->mutable_data<T>(ctx.GetPlace());
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto functor = Functor();
    auto attrs = functor.GetAttrs();
    for (auto& attr : attrs) {
      *attr.second = ctx.Attr<float>(attr.first);
    }

    std::vector<const framework::Tensor*> ins = {d_out};
    std::vector<framework::Tensor*> outs = {d_x};

    if (static_cast<int>(Functor::FwdDeps()) ==
        static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
      // Only need forward output Out
      ins.push_back(out);
      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
          dev_ctx, ins, &outs, functor);
    } else if (static_cast<int>(Functor::FwdDeps()) ==
               static_cast<int>(ActBwdOpFwdDeps::kDepX)) {
      // Only need forward input X
      ins.push_back(x);
      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
          dev_ctx, ins, &outs, functor);
    } else {
      paddle::operators::LaunchSameDimsElementwiseCudaKernel<T>(
          dev_ctx, ins, &outs, functor);
    }
  }
};

USE_PHI_FUNCTOR(CudaCos)
USE_PHI_FUNCTOR(CudaTan)
USE_PHI_FUNCTOR(CudaAcos)
USE_PHI_FUNCTOR(CudaSin)
USE_PHI_FUNCTOR(CudaAsin)
USE_PHI_FUNCTOR(CudaAtan)
USE_PHI_FUNCTOR(CudaSinh)
USE_PHI_FUNCTOR(CudaCosh)
USE_PHI_FUNCTOR(CudaAsinh)
USE_PHI_FUNCTOR(CudaAcosh)
USE_PHI_FUNCTOR(CudaAtanh)
USE_PHI_FUNCTOR(CudaTanh)
USE_PHI_FUNCTOR(CudaBRelu)
USE_PHI_FUNCTOR(CudaLeakyRelu)
USE_PHI_FUNCTOR(CudaThresholdedRelu)
USE_PHI_FUNCTOR(CudaRelu6)
USE_PHI_FUNCTOR(CudaHardShrink)
USE_PHI_FUNCTOR(CudaSoftShrink)
USE_PHI_FUNCTOR(CudaTanhShrink)
USE_PHI_FUNCTOR(CudaSilu)
USE_PHI_FUNCTOR(CudaELU)
USE_PHI_FUNCTOR(CudaSoftsign)
USE_PHI_FUNCTOR(CudaSigmoid)
USE_PHI_FUNCTOR(CudaLogSigmoid)
USE_PHI_FUNCTOR(CudaHardSigmoid)
USE_PHI_FUNCTOR(CudaLog)
USE_PHI_FUNCTOR(CudaLog2)
USE_PHI_FUNCTOR(CudaLog10)
USE_PHI_FUNCTOR(CudaLog1p)
USE_PHI_FUNCTOR(CudaSwish)
USE_PHI_FUNCTOR(CudaHardSwish)

template <typename T>
using CudaRoundFunctor = phi::funcs::CudaRoundFunctor<T>;

template <typename T>
using CudaFloorFunctor = phi::funcs::CudaFloorFunctor<T>;

template <typename T>
using CudaCeilFunctor = phi::funcs::CudaCeilFunctor<T>;

template <typename T>
using CudaZeroGradFunctor = phi::funcs::CudaZeroGradFunctor<T>;

USE_PHI_FUNCTOR(CudaExp)
USE_PHI_FUNCTOR(CudaExpm1)
USE_PHI_FUNCTOR(CudaMish)
USE_PHI_FUNCTOR(CudaSTanh)
USE_PHI_FUNCTOR(CudaReciprocal)
USE_PHI_FUNCTOR(CudaSquare)
USE_PHI_FUNCTOR(CudaSqrt)
USE_PHI_FUNCTOR(CudaRsqrt)
USE_PHI_FUNCTOR(CudaSoftplus)

template <typename T>
using CudaELUGradNegativeAlphaFunctor =
    phi::funcs::CudaELUGradNegativeAlphaFunctor<T>;

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

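// REGISTER_ACTIVATION_CUDA_KERNEL registers forward and backward CUDA kernels
// for float, double, float16 and bfloat16; the *_INT variant below registers
// the same set plus int and int64_t.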
#define REGISTER_ACTIVATION_CUDA_KERNEL(                               \
    act_type, op_name, functor, grad_functor)                          \
  REGISTER_OP_CUDA_KERNEL(                                             \
      act_type,                                                        \
      ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,   \
                                ops::functor<float>>,                  \
      ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,   \
                                ops::functor<double>>,                 \
      ops::ActivationCudaKernel<plat::CUDADeviceContext,               \
                                ops::functor<plat::float16>>,          \
      ops::ActivationCudaKernel<plat::CUDADeviceContext,               \
                                ops::functor<plat::bfloat16>>);        \
  REGISTER_OP_CUDA_KERNEL(                                             \
      act_type##_grad,                                                 \
      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,           \
                                    ops::grad_functor<float>>,         \
      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,           \
                                    ops::grad_functor<double>>,        \
      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,           \
                                    ops::grad_functor<plat::float16>>, \
      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,           \
                                    ops::grad_functor<plat::bfloat16>>);

#define REGISTER_ACTIVATION_CUDA_KERNEL_INT(                           \
    act_type, op_name, functor, grad_functor)                          \
  REGISTER_OP_CUDA_KERNEL(                                             \
      act_type,                                                        \
      ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,   \
                                ops::functor<float>>,                  \
      ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,   \
                                ops::functor<double>>,                 \
      ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,   \
                                ops::functor<int>>,                    \
      ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,   \
                                ops::functor<int64_t>>,                \
      ops::ActivationCudaKernel<plat::CUDADeviceContext,               \
                                ops::functor<plat::float16>>,          \
      ops::ActivationCudaKernel<plat::CUDADeviceContext,               \
                                ops::functor<plat::bfloat16>>);        \
  REGISTER_OP_CUDA_KERNEL(                                             \
      act_type##_grad,                                                 \
      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,           \
                                    ops::grad_functor<float>>,         \
      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,           \
                                    ops::grad_functor<double>>,        \
      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,           \
                                    ops::grad_functor<int>>,           \
      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,           \
                                    ops::grad_functor<int64_t>>,       \
      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,           \
                                    ops::grad_functor<plat::float16>>, \
      ops::ActivationGradCudaKernel<plat::CUDADeviceContext,           \
                                    ops::grad_functor<plat::bfloat16>>);

REGISTER_OP_CUDA_KERNEL(
    relu6,
    ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
                              ops::CudaRelu6Functor<float>>,
    ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
                              ops::CudaRelu6Functor<double>>,
    ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
                              ops::CudaRelu6Functor<int>>,
    ops::ActivationCudaKernel<paddle::platform::CUDADeviceContext,
                              ops::CudaRelu6Functor<int64_t>>,
    ops::ActivationCudaKernel<plat::CUDADeviceContext,
                              ops::CudaRelu6Functor<plat::float16>>,
    ops::ActivationCudaKernel<plat::CUDADeviceContext,
                              ops::CudaRelu6Functor<plat::bfloat16>>);
REGISTER_OP_CUDA_KERNEL(
    relu6_grad,
    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
                                  ops::CudaRelu6GradFunctor<float>>,
    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
                                  ops::CudaRelu6GradFunctor<double>>,
    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
                                  ops::CudaRelu6GradFunctor<int>>,
    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
                                  ops::CudaRelu6GradFunctor<int64_t>>,
    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
                                  ops::CudaRelu6GradFunctor<plat::float16>>,
    ops::ActivationGradCudaKernel<plat::CUDADeviceContext,
                                  ops::CudaRelu6GradFunctor<plat::bfloat16>>);

#define FOR_EACH_ACTIVATION_CUDA_OP(__macro)                                  \
  __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \
  __macro(softsign, Softsign, CudaSoftsignFunctor, CudaSoftsignGradFunctor);

FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL)

#ifdef PADDLE_WITH_XPU_KP
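// When built with PADDLE_WITH_XPU_KP, the activation ops are additionally
// registered as KP kernels on XPUPlace (float only), reusing
// ActivationCudaKernel / ActivationGradCudaKernel with XPUDeviceContext.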
REGISTER_OP_KERNEL(
    brelu,
    KP,
    plat::XPUPlace,
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              phi::funcs::CudaBReluFunctor<float>>);
REGISTER_OP_KERNEL(
    brelu_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  phi::funcs::CudaBReluGradFunctor<float>>);

REGISTER_OP_KERNEL(ceil,
                   KP,
                   plat::XPUPlace,
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaCeilFunctor<float>>);
REGISTER_OP_KERNEL(
    ceil_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaZeroGradFunctor<float>>);

REGISTER_OP_KERNEL(
    celu,
    KP,
    plat::XPUPlace,
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              phi::funcs::CudaCELUFunctor<float>>);
REGISTER_OP_KERNEL(
    celu_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  phi::funcs::CudaCELUGradFunctor<float>>);

REGISTER_OP_KERNEL(elu,
                   KP,
                   plat::XPUPlace,
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaELUFunctor<float>>);
REGISTER_OP_KERNEL(
    elu_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaELUGradFunctor<float>>);

REGISTER_OP_KERNEL(exp,
                   KP,
                   plat::XPUPlace,
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaExpFunctor<float>>);
REGISTER_OP_KERNEL(
    exp_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaExpGradFunctor<float>>);

REGISTER_OP_KERNEL(floor,
                   KP,
                   plat::XPUPlace,
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaFloorFunctor<float>>);
REGISTER_OP_KERNEL(
    floor_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaZeroGradFunctor<float>>);

REGISTER_OP_KERNEL(
    hard_shrink,
    KP,
    plat::XPUPlace,
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              ops::CudaHardShrinkFunctor<float>>);
REGISTER_OP_KERNEL(
    hard_shrink_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaHardShrinkGradFunctor<float>>);

REGISTER_OP_KERNEL(
    hard_sigmoid,
    KP,
    plat::XPUPlace,
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              ops::CudaHardSigmoidFunctor<float>>);
REGISTER_OP_KERNEL(
    hard_sigmoid_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaHardSigmoidGradFunctor<float>>);

REGISTER_OP_KERNEL(hard_swish,
                   KP,
                   plat::XPUPlace,
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaHardSwishFunctor<float>>);
REGISTER_OP_KERNEL(
    hard_swish_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaHardSwishGradFunctor<float>>);

REGISTER_OP_KERNEL(
    leaky_relu,
    KP,
    plat::XPUPlace,
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              phi::funcs::CudaLeakyReluFunctor<float>>);
REGISTER_OP_KERNEL(
    leaky_relu_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  phi::funcs::CudaLeakyReluGradFunctor<float>>);

REGISTER_OP_KERNEL(log,
                   KP,
                   plat::XPUPlace,
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaLogFunctor<float>>);
REGISTER_OP_KERNEL(
    log_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaLogGradFunctor<float>>);

REGISTER_OP_KERNEL(log1p,
                   KP,
                   plat::XPUPlace,
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaLog1pFunctor<float>>);
REGISTER_OP_KERNEL(
    log1p_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaLog1pGradFunctor<float>>);

REGISTER_OP_KERNEL(
    logsigmoid,
    KP,
    plat::XPUPlace,
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              ops::CudaLogSigmoidFunctor<float>>);
REGISTER_OP_KERNEL(
    logsigmoid_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaLogSigmoidGradFunctor<float>>);

REGISTER_OP_KERNEL(
    reciprocal,
    KP,
    plat::XPUPlace,
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              ops::CudaReciprocalFunctor<float>>);
REGISTER_OP_KERNEL(
    reciprocal_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaReciprocalGradFunctor<float>>);

REGISTER_OP_KERNEL(
    relu,
    KP,
    plat::XPUPlace,
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              phi::funcs::CudaReluFunctor<float>>);
REGISTER_OP_KERNEL(
    relu_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  phi::funcs::CudaReluGradFunctor<float>>);

REGISTER_OP_KERNEL(relu6,
                   KP,
                   plat::XPUPlace,
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaRelu6Functor<float>>);
REGISTER_OP_KERNEL(
    relu6_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaRelu6GradFunctor<float>>);

REGISTER_OP_KERNEL(sigmoid,
                   KP,
                   plat::XPUPlace,
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaSigmoidFunctor<float>>);
REGISTER_OP_KERNEL(
    sigmoid_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaSigmoidGradFunctor<float>>);

REGISTER_OP_KERNEL(silu,
                   KP,
                   plat::XPUPlace,
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaSiluFunctor<float>>);
REGISTER_OP_KERNEL(
    silu_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaSiluGradFunctor<float>>);

REGISTER_OP_KERNEL(soft_relu,
                   KP,
                   plat::XPUPlace,
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaSoftReluFunctor<float>>);
REGISTER_OP_KERNEL(
    soft_relu_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaSoftReluGradFunctor<float>>);

REGISTER_OP_KERNEL(softplus,
                   KP,
                   plat::XPUPlace,
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaSoftplusFunctor<float>>);
REGISTER_OP_KERNEL(
    softplus_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaSoftplusGradFunctor<float>>);

REGISTER_OP_KERNEL(
    softshrink,
    KP,
    plat::XPUPlace,
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              ops::CudaSoftShrinkFunctor<float>>);
REGISTER_OP_KERNEL(
    softshrink_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaSoftShrinkGradFunctor<float>>);

REGISTER_OP_KERNEL(softsign,
                   KP,
                   plat::XPUPlace,
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaSoftsignFunctor<float>>);
REGISTER_OP_KERNEL(
    softsign_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaSoftsignGradFunctor<float>>);

REGISTER_OP_KERNEL(sqrt,
                   KP,
                   plat::XPUPlace,
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaSqrtFunctor<float>>);
REGISTER_OP_KERNEL(
    sqrt_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaSqrtGradFunctor<float>>);

REGISTER_OP_KERNEL(square,
                   KP,
                   plat::XPUPlace,
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaSquareFunctor<float>>);
REGISTER_OP_KERNEL(
    square_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaSquareGradFunctor<float>>);

REGISTER_OP_KERNEL(swish,
                   KP,
                   plat::XPUPlace,
                   ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                                             ops::CudaSwishFunctor<float>>);
REGISTER_OP_KERNEL(
    swish_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaSwishGradFunctor<float>>);

REGISTER_OP_KERNEL(
    thresholded_relu,
    KP,
    plat::XPUPlace,
    ops::ActivationCudaKernel<paddle::platform::XPUDeviceContext,
                              ops::CudaThresholdedReluFunctor<float>>);
REGISTER_OP_KERNEL(
    thresholded_relu_grad,
    KP,
    plat::XPUPlace,
    ops::ActivationGradCudaKernel<paddle::platform::XPUDeviceContext,
                                  ops::CudaThresholdedReluGradFunctor<float>>);

#endif  // PADDLE_WITH_XPU_KP