/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/math/unpooling.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"

namespace paddle {
namespace operators {
namespace math {
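/*
 * 2D max-unpool forward kernel: one thread handles one input element,
 * computes its (batch, channel) position from the linear index, and
 * scatters the value to the location recorded in indices_data within the
 * corresponding output plane.
 */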
template <typename T>
__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data,
                                  const int* indices_data,
                                  const int input_height, const int input_width,
                                  const int channels, T* output_data,
                                  const int output_height,
                                  const int output_width) {
  CUDA_KERNEL_LOOP(linearIndex, nthreads) {
    int c = (linearIndex / input_width / input_height) % channels;
    int n = linearIndex / input_width / input_height / channels;
    output_data += (n * channels + c) * output_height * output_width;
    int maxind = indices_data[linearIndex];
    output_data[maxind] = input_data[linearIndex];
  }
}

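/*
 * 2D max-unpool backward kernel: each input-gradient element gathers the
 * output-gradient value stored at its recorded max index.
 */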
template <typename T>
__global__ void KernelUnpool2dMaxGrad(
    const int nthreads, const T* input_data, const int* indices_data,
    const int input_height, const int input_width, const int channels,
    const T* output_data, const T* output_grad, const int output_height,
    const int output_width, T* input_grad) {
  CUDA_KERNEL_LOOP(linearIndex, nthreads) {
    int c = (linearIndex / input_width / input_height) % channels;
    int n = linearIndex / input_width / input_height / channels;
    output_grad += (n * channels + c) * output_height * output_width;
    int maxind = indices_data[linearIndex];
    input_grad[linearIndex] = output_grad[maxind];
  }
}
/*
 * All tensors are in NCHW format.
 */

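/*
 * 3D max-unpool forward kernel: the NCDHW analogue of KernelUnpool2dMax,
 * scattering each input element into its (batch, channel) output volume.
 */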
template <typename T>
__global__ void KernelUnpool3dMax(const int nthreads, const T* input_data,
                                  const int* indices_data,
                                  const int input_depth, const int input_height,
                                  const int input_width, const int channels,
                                  T* output_data, const int output_depth,
                                  const int output_height,
                                  const int output_width) {
  CUDA_KERNEL_LOOP(linearIndex, nthreads) {
    int c = (linearIndex / input_depth / input_width / input_height) % channels;
    int n = linearIndex / input_depth / input_width / input_height / channels;
    output_data +=
        (n * channels + c) * output_depth * output_height * output_width;
    int maxind = indices_data[linearIndex];
    output_data[maxind] = input_data[linearIndex];
  }
}

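/*
 * 3D max-unpool backward kernel: the NCDHW analogue of
 * KernelUnpool2dMaxGrad, gathering gradients from output_grad at the
 * recorded max indices.
 */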
template <typename T>
__global__ void KernelUnpool3dMaxGrad(
    const int nthreads, const T* input_data, const int* indices_data,
    const int input_depth, const int input_height, const int input_width,
    const int channels, const T* output_data, const T* output_grad,
    const int output_depth, const int output_height, const int output_width,
    T* input_grad) {
  CUDA_KERNEL_LOOP(linearIndex, nthreads) {
    int c = (linearIndex / input_depth / input_width / input_height) % channels;
    int n = linearIndex / input_depth / input_width / input_height / channels;
    output_grad +=
        (n * channels + c) * output_depth * output_height * output_width;
    int maxind = indices_data[linearIndex];
    input_grad[linearIndex] = output_grad[maxind];
  }
}
/*
 * All tensors are in NCDHW format.
 */

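/*
 * Launches KernelUnpool2dMax with one thread per input element
 * (256 threads per block on HIP, 1024 on CUDA).
 */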
template <typename T>
class Unpool2dMaxFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& indices, framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
    const int output_channels = output->dims()[1];
    const int output_height = output->dims()[2];
    const int output_width = output->dims()[3];
    const T* input_data = input.data<T>();
    const int* indices_data = indices.data<int>();
    T* output_data = output->mutable_data<T>(context.GetPlace());
#ifdef __HIPCC__
    int threads = 256;
#else
    int threads = 1024;
#endif
    int grid = (input.numel() + threads - 1) / threads;
    KernelUnpool2dMax<T><<<grid, threads, 0, context.stream()>>>(
        input.numel(), input_data, indices_data, input_height, input_width,
        output_channels, output_data, output_height, output_width);
  }
};
/*
 * All tensors are in NCHW format.
 */
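/*
 * Launches KernelUnpool2dMaxGrad to route output gradients back to the
 * input positions recorded in indices.
 */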
template <typename T>
class Unpool2dMaxGradFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& indices,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad,
                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
    const int output_channels = output.dims()[1];
    const int output_height = output.dims()[2];
    const int output_width = output.dims()[3];
    const T* input_data = input.data<T>();
    const int* indices_data = indices.data<int>();
    const T* output_data = output.data<T>();
    const T* output_grad_data = output_grad.data<T>();
    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
#ifdef __HIPCC__
    int threads = 256;
#else
    int threads = 1024;
#endif
    int grid = (input.numel() + threads - 1) / threads;
    KernelUnpool2dMaxGrad<T><<<grid, threads, 0, context.stream()>>>(
        input.numel(), input_data, indices_data, input_height, input_width,
        output_channels, output_data, output_grad_data, output_height,
        output_width, input_grad_data);
  }
};

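/*
 * Launches KernelUnpool3dMax with one thread per input element; the block
 * size matches the 2D functor.
 */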
template <typename T>
class Unpool3dMaxFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& indices, framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
    const int input_width = input.dims()[4];
    const int output_channels = output->dims()[1];
    const int output_depth = output->dims()[2];
    const int output_height = output->dims()[3];
    const int output_width = output->dims()[4];
    const T* input_data = input.data<T>();
    const int* indices_data = indices.data<int>();
    T* output_data = output->mutable_data<T>(context.GetPlace());
#ifdef __HIPCC__
    int threads = 256;
#else
    int threads = 1024;
#endif
    int grid = (input.numel() + threads - 1) / threads;
    KernelUnpool3dMax<T><<<grid, threads, 0, context.stream()>>>(
        input.numel(), input_data, indices_data, input_depth, input_height,
        input_width, output_channels, output_data, output_depth, output_height,
        output_width);
  }
};
/*
 * All tensors are in NCDHW format.
 */
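/*
 * Launches KernelUnpool3dMaxGrad to route output gradients back to the
 * input positions recorded in indices.
 */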
template <typename T>
class Unpool3dMaxGradFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input,
                  const framework::Tensor& indices,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad,
                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
    const int input_width = input.dims()[4];
    const int output_channels = output.dims()[1];
    const int output_depth = output.dims()[2];
    const int output_height = output.dims()[3];
    const int output_width = output.dims()[4];
    const T* input_data = input.data<T>();
    const int* indices_data = indices.data<int>();
    const T* output_data = output.data<T>();
    const T* output_grad_data = output_grad.data<T>();
    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
#ifdef __HIPCC__
    int threads = 256;
#else
    int threads = 1024;
#endif
    int grid = (input.numel() + threads - 1) / threads;
    KernelUnpool3dMaxGrad<T><<<grid, threads, 0, context.stream()>>>(
        input.numel(), input_data, indices_data, input_depth, input_height,
        input_width, output_channels, output_data, output_grad_data,
        output_depth, output_height, output_width, input_grad_data);
  }
};

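/* Explicit instantiations for float and double. */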
template class Unpool2dMaxGradFunctor<platform::CUDADeviceContext, float>;
template class Unpool2dMaxGradFunctor<platform::CUDADeviceContext, double>;
template class Unpool2dMaxFunctor<platform::CUDADeviceContext, float>;
template class Unpool2dMaxFunctor<platform::CUDADeviceContext, double>;
template class Unpool3dMaxGradFunctor<platform::CUDADeviceContext, float>;
template class Unpool3dMaxGradFunctor<platform::CUDADeviceContext, double>;
template class Unpool3dMaxFunctor<platform::CUDADeviceContext, float>;
template class Unpool3dMaxFunctor<platform::CUDADeviceContext, double>;
}  // namespace math
}  // namespace operators
}  // namespace paddle