conv_transpose_cudnn_op.cu.cc 12.0 KB
Newer Older
1
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Z
zchen0211 已提交
2

L
Luo Tao 已提交
3 4 5
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
Z
zchen0211 已提交
6

L
Luo Tao 已提交
7
    http://www.apache.org/licenses/LICENSE-2.0
Z
zchen0211 已提交
8

L
Luo Tao 已提交
9 10 11 12 13
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
Z
zchen0211 已提交
14

Y
Yi Wang 已提交
15 16 17 18 19 20
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cudnn_helper.h"
Z
zchen0211 已提交
21 22 23 24 25 26 27 28 29 30

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
using DataLayout = platform::DataLayout;

31
static constexpr size_t kConvCUDNNWorkspaceLimitBytes = 1024 * 1024 * 1024;
Z
zchen0211 已提交
32 33

template <typename T>
34
class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
Z
zchen0211 已提交
35 36 37
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
D
dzhwinter 已提交
38
                   "It must use CUDAPlace.");
Z
zchen0211 已提交
39 40 41 42 43 44
    auto* input = ctx.Input<Tensor>("Input");
    auto* filter = ctx.Input<Tensor>("Filter");
    auto* output = ctx.Output<Tensor>("Output");

    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
Z
zchen0211 已提交
45
    // cudnn v5 does not support dilations
Z
zchen0211 已提交
46
    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
47
    int groups = ctx.Attr<int>("groups");
Z
zchen0211 已提交
48 49 50 51 52 53 54 55 56 57
    int user_workspace_size = ctx.Attr<int>("workspace_size_MB");

    const T* input_data = input->data<T>();
    const T* filter_data = filter->data<T>();
    T* output_data = output->mutable_data<T>(ctx.GetPlace());
    // ------------------- cudnn descriptors ---------------------
    ScopedTensorDescriptor input_desc;
    ScopedTensorDescriptor output_desc;
    ScopedFilterDescriptor filter_desc;
    ScopedConvolutionDescriptor conv_desc;
C
chengduoZH 已提交
58 59 60 61 62 63 64
    DataLayout layout;

    if (strides.size() == 2U) {
      layout = DataLayout::kNCHW;
    } else {
      layout = DataLayout::kNCDHW;
    }
Z
zchen0211 已提交
65

C
chengduoZH 已提交
66
    // (N, M, H, W) or (N, M, D, H, W)
Z
zchen0211 已提交
67
    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
68
        layout, framework::vectorize2int(input->dims()), groups);
C
chengduoZH 已提交
69
    // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
Z
zchen0211 已提交
70
    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
71
        layout, framework::vectorize2int(output->dims()), groups);
C
chengduoZH 已提交
72
    // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
Z
zchen0211 已提交
73
    cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
74
        layout, framework::vectorize2int(filter->dims()), groups);
Z
zchen0211 已提交
75 76 77 78 79
    cudnnConvolutionDescriptor_t cudnn_conv_desc =
        conv_desc.descriptor<T>(paddings, strides, dilations);

    // ------------------- cudnn conv workspace ---------------------
    size_t workspace_size_in_bytes;  // final workspace to allocate.
80
    size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes;
Z
zchen0211 已提交
81 82 83 84
    if (user_workspace_size > 0) {
      workspace_size_limit = user_workspace_size * 1024 * 1024;
    }
    // ------------------- cudnn conv algorithm ---------------------
Z
zchen0211 已提交
85
    cudnnConvolutionBwdDataAlgo_t algo;
Q
QI JUN 已提交
86 87
    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto handle = dev_ctx.cudnn_handle();
Z
zchen0211 已提交
88
    // Get the algorithm
W
Wu Yi 已提交
89
    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
Z
zchen0211 已提交
90 91 92 93 94 95 96
        handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
        // dxDesc: Handle to the previously initialized output tensor
        // descriptor.
        cudnn_output_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
        workspace_size_limit, &algo));

    // get workspace size able to allocate
W
Wu Yi 已提交
97
    CUDNN_ENFORCE(
Z
zchen0211 已提交
98 99
        platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
            handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
Z
zchen0211 已提交
100
            cudnn_output_desc, algo, &workspace_size_in_bytes));
Z
zchen0211 已提交
101 102

    // ------------------- cudnn conv transpose forward ---------------------
103 104 105
    int input_offset = input->numel() / input->dims()[0] / groups;
    int output_offset = output->numel() / output->dims()[0] / groups;
    int filter_offset = filter->numel() / groups;
Z
zchen0211 已提交
106
    T alpha = 1.0f, beta = 0.0f;
C
chengduo 已提交
107 108 109 110 111 112

    auto temp_allocation =
        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
            workspace_size_in_bytes);
    void* cudnn_workspace = temp_allocation->ptr();

113
    for (int g = 0; g < groups; g++) {
C
chengduo 已提交
114 115 116 117 118
      CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
          handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g,
          cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc,
          algo, cudnn_workspace, workspace_size_in_bytes, &beta,
          cudnn_output_desc, output_data + output_offset * g));
119
    }
Z
zchen0211 已提交
120 121 122 123
  }
};

template <typename T>
124
class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
Z
zchen0211 已提交
125 126 127
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
D
dzhwinter 已提交
128
                   "It must use CUDAPlace.");
Z
zchen0211 已提交
129 130 131 132 133 134 135 136 137 138 139
    auto input = ctx.Input<Tensor>("Input");
    auto filter = ctx.Input<Tensor>("Filter");
    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
    auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
    auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
    const T* input_data = input->data<T>();
    const T* output_grad_data = output_grad->data<T>();
    const T* filter_data = filter->data<T>();

    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
Z
zchen0211 已提交
140
    // cudnn v5 does not support dilations
Z
zchen0211 已提交
141
    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
142
    int groups = ctx.Attr<int>("groups");
Z
zchen0211 已提交
143 144 145 146
    int user_workspace_size = ctx.Attr<int>("workspace_size_MB");

    // ------------------- cudnn descriptors ---------------------
    ScopedTensorDescriptor input_desc;
Z
zchen0211 已提交
147
    ScopedTensorDescriptor output_desc;
Z
zchen0211 已提交
148 149 150 151
    ScopedFilterDescriptor filter_desc;
    ScopedConvolutionDescriptor conv_desc;
    DataLayout layout = DataLayout::kNCHW;

C
chengduoZH 已提交
152
    // Input: (N, M, H, W) or (N, M, D, H, W)
Z
zchen0211 已提交
153
    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
154
        layout, framework::vectorize2int(input->dims()), groups);
C
chengduoZH 已提交
155
    // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
Z
zchen0211 已提交
156
    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
157
        layout, framework::vectorize2int(output_grad->dims()), groups);
C
chengduoZH 已提交
158
    // Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w)
Z
zchen0211 已提交
159
    cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
160
        layout, framework::vectorize2int(filter->dims()), groups);
Z
zchen0211 已提交
161 162 163 164 165

    cudnnConvolutionDescriptor_t cudnn_conv_desc =
        conv_desc.descriptor<T>(paddings, strides, dilations);

    // ------------------- cudnn backward algorithm ---------------------
Z
zchen0211 已提交
166
    cudnnConvolutionFwdAlgo_t data_algo;
Z
zchen0211 已提交
167
    cudnnConvolutionBwdFilterAlgo_t filter_algo;
Z
zchen0211 已提交
168 169
    size_t bwd_filter_ws_size, fwd_ws_size;
    size_t workspace_size_in_bytes = 0;
170
    size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes;
Z
zchen0211 已提交
171 172 173 174
    if (user_workspace_size > 0) {
      workspace_size_limit = user_workspace_size * 1024 * 1024;
    }

Q
QI JUN 已提交
175 176
    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto handle = dev_ctx.cudnn_handle();
Z
zchen0211 已提交
177
    if (input_grad) {
Z
zchen0211 已提交
178
      // choose backward algorithm for data
W
Wu Yi 已提交
179
      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
Z
zchen0211 已提交
180 181 182
          handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
          cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
          workspace_size_limit, &data_algo));
W
Wu Yi 已提交
183
      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
Z
zchen0211 已提交
184 185 186
          handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
          cudnn_input_desc, data_algo, &fwd_ws_size));
      workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size);
Z
zchen0211 已提交
187 188 189
    }

    if (filter_grad) {
Z
zchen0211 已提交
190
      // choose backward algorithm for filter
W
Wu Yi 已提交
191
      CUDNN_ENFORCE(
Z
zchen0211 已提交
192
          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
Z
zchen0211 已提交
193
              handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
Z
zchen0211 已提交
194 195 196 197
              cudnn_filter_desc,
              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
              workspace_size_limit, &filter_algo));

Z
zchen0211 已提交
198
      // get workspace for backwards filter algorithm
W
Wu Yi 已提交
199
      CUDNN_ENFORCE(
Z
zchen0211 已提交
200
          platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
Z
zchen0211 已提交
201 202 203 204
              handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
              cudnn_filter_desc, filter_algo, &bwd_filter_ws_size));
      workspace_size_in_bytes =
          std::max(workspace_size_in_bytes, bwd_filter_ws_size);
Z
zchen0211 已提交
205
    }
Z
zchen0211 已提交
206

Z
zchen0211 已提交
207 208
    // ------------------- cudnn conv backward data ---------------------
    // FIXME(typhoonzero): template type T may not be the same as cudnn call.
209 210 211 212
    int input_offset = input->numel() / input->dims()[0] / groups;
    int output_grad_offset =
        output_grad->numel() / output_grad->dims()[0] / groups;
    int filter_offset = filter->numel() / groups;
Z
zchen0211 已提交
213
    T alpha = 1.0f, beta = 0.0f;
C
chengduo 已提交
214 215 216 217 218 219

    auto temp_allocation =
        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
            workspace_size_in_bytes);
    void* cudnn_workspace = temp_allocation->ptr();

Z
zchen0211 已提交
220 221
    if (input_grad) {
      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
C
chengduoZH 已提交
222
      // Because beta is zero, it is unnecessary to reset input_grad.
223
      for (int g = 0; g < groups; g++) {
C
chengduo 已提交
224 225 226 227 228 229
        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
            handle, &alpha, cudnn_output_desc,
            output_grad_data + output_grad_offset * g, cudnn_filter_desc,
            filter_data + filter_offset * g, cudnn_conv_desc, data_algo,
            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
            input_grad_data + input_offset * g));
230
      }
Z
zchen0211 已提交
231
    }
Z
zchen0211 已提交
232

Z
zchen0211 已提交
233 234 235
    // ------------------- cudnn conv backward filter ---------------------
    if (filter_grad) {
      T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
C
chengduoZH 已提交
236
      // Because beta is zero, it is unnecessary to reset filter_grad.
Z
zchen0211 已提交
237
      // Gradient with respect to the filter
238
      for (int g = 0; g < groups; g++) {
C
chengduo 已提交
239 240 241 242 243 244
        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
            handle, &alpha, cudnn_output_desc,
            output_grad_data + output_grad_offset * g, cudnn_input_desc,
            input_data + input_offset * g, cudnn_conv_desc, filter_algo,
            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_filter_desc,
            filter_grad_data + filter_offset * g));
245
      }
Z
zchen0211 已提交
246 247 248 249 250 251 252 253 254
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

255 256 257 258 259 260 261 262 263 264 265 266 267
REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace,
                   ops::CUDNNConvTransposeOpKernel<float>,
                   ops::CUDNNConvTransposeOpKernel<double>);
REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace,
                   ops::CUDNNConvTransposeGradOpKernel<float>,
                   ops::CUDNNConvTransposeGradOpKernel<double>);

REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace,
                   ops::CUDNNConvTransposeOpKernel<float>,
                   ops::CUDNNConvTransposeOpKernel<double>);
REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace,
                   ops::CUDNNConvTransposeGradOpKernel<float>,
                   ops::CUDNNConvTransposeGradOpKernel<double>);