/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/math/prelu.h"

namespace paddle {
namespace operators {
namespace math {

#define CUDA_NUM_THREADS 1024

// Smallest number of thread blocks of CUDA_NUM_THREADS threads that
// covers N work items (ceiling division; yields 0 for N == 0).
inline static int PADDLE_GET_BLOCKS(const int N) {
  const int blocks = (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
  return blocks;
}

// Channel-first PReLU: every channel owns one slope, alpha[c].
// Dividing the flat index by plane_size (elements per channel plane,
// see the launching functor) enumerates (batch, channel) planes; taking
// that modulo channel_num recovers the channel for the alpha lookup.
template <typename T>
__global__ void PReluChannelFirstWiseKernel(const T *input, const T *alpha,
                                            T *output, size_t channel_num,
                                            size_t plane_size, size_t numel) {
  CUDA_KERNEL_LOOP(index, numel) {
    const size_t plane = index / plane_size;
    const T slope = alpha[plane % channel_num];
    const T val = input[index];
    // Identity for positive values, per-channel slope otherwise.
    output[index] = (val > static_cast<T>(0)) ? val : slope * val;
  }
}

// Channel-last PReLU: the channel is the fastest-varying dimension, so
// the flat index modulo channel_num selects the per-channel slope.
template <typename T>
__global__ void PReluChannelLastWiseKernel(const T *input, const T *alpha,
                                           T *output, size_t channel_num,
                                           size_t numel) {
  CUDA_KERNEL_LOOP(index, numel) {
    const T slope = alpha[index % channel_num];
    const T val = input[index];
    // Identity for positive values, per-channel slope otherwise.
    output[index] = (val > static_cast<T>(0)) ? val : slope * val;
  }
}

// Element-wise PReLU: one slope per element of a single sample
// (spatial_size = numel / batch_size, see the launching functor), so
// the slope table is shared across the batch via index % spatial_size.
template <typename T>
__global__ void PReluElementWiseKernel(const T *input, const T *alpha,
                                       T *output, size_t spatial_size,
                                       size_t numel) {
  CUDA_KERNEL_LOOP(index, numel) {
    const T slope = alpha[index % spatial_size];
    const T val = input[index];
    // Identity for positive values, per-element slope otherwise.
    output[index] = (val > static_cast<T>(0)) ? val : slope * val;
  }
}

// Scalar PReLU: a single slope alpha[0] shared by every element.
template <typename T>
__global__ void PReluScalarKernel(const T *input, const T *alpha, T *output,
                                  size_t numel) {
  // The shared slope is read once, outside the loop.
  const T slope = alpha[0];
  CUDA_KERNEL_LOOP(index, numel) {
    const T val = input[index];
    // Identity for positive values, scaled otherwise.
    output[index] = (val > static_cast<T>(0)) ? val : slope * val;
  }
}

// Launches the channel-wise PReLU kernel on `stream`, choosing the
// channel-first or channel-last variant from `channel_last`.
// Asynchronous: no synchronization or error check is performed here.
template <typename T>
void PreluChannelWiseDirectCUDAFunctor<T>::operator()(
    gpuStream_t stream, const T *input, const T *alpha, T *output,
    size_t batch_size, size_t channel, bool channel_last, size_t numel) {
  const int grid = PADDLE_GET_BLOCKS(numel);
  if (channel_last) {
    PReluChannelLastWiseKernel<<<grid, CUDA_NUM_THREADS, 0, stream>>>(
        input, alpha, output, channel, numel);
  } else {
    // Elements per (batch, channel) plane.
    const size_t plane_size = numel / batch_size / channel;
    PReluChannelFirstWiseKernel<<<grid, CUDA_NUM_THREADS, 0, stream>>>(
        input, alpha, output, channel, plane_size, numel);
  }
}

// Launches the element-wise PReLU kernel on `stream`; the per-sample
// slope table has numel / batch_size entries. Asynchronous launch.
template <typename T>
void PreluElementWiseDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
                                                      const T *input,
                                                      const T *alpha, T *output,
                                                      size_t batch_size,
                                                      size_t numel) {
  const int grid = PADDLE_GET_BLOCKS(numel);
  const size_t spatial_size = numel / batch_size;
  PReluElementWiseKernel<<<grid, CUDA_NUM_THREADS, 0, stream>>>(
      input, alpha, output, spatial_size, numel);
}

// Launches the scalar PReLU kernel (single shared slope alpha[0]) on
// `stream`. Asynchronous launch.
template <typename T>
void PreluScalarDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
                                                 const T *input, const T *alpha,
                                                 T *output, size_t numel) {
  const int grid = PADDLE_GET_BLOCKS(numel);
  PReluScalarKernel<<<grid, CUDA_NUM_THREADS, 0, stream>>>(input, alpha, output,
                                                           numel);
}

// Explicit instantiations for the element types this op supports.
template class PreluChannelWiseDirectCUDAFunctor<paddle::platform::float16>;
template class PreluChannelWiseDirectCUDAFunctor<float>;
template class PreluChannelWiseDirectCUDAFunctor<double>;

template class PreluElementWiseDirectCUDAFunctor<paddle::platform::float16>;
template class PreluElementWiseDirectCUDAFunctor<float>;
template class PreluElementWiseDirectCUDAFunctor<double>;

template class PreluScalarDirectCUDAFunctor<paddle::platform::float16>;
template class PreluScalarDirectCUDAFunctor<float>;
template class PreluScalarDirectCUDAFunctor<double>;

}  // namespace math
}  // namespace operators
}  // namespace paddle