conv_cudnn_helper.h 27.6 KB
Newer Older
Q
qingqing01 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

17
#include "paddle/fluid/operators/conv_base_helper.h"
18
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
19 20
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
21

Q
qingqing01 已提交
22 23 24
namespace paddle {
namespace operators {

25
using ConvArgs = ConvArgsBase<cudnnHandle_t, cudnnDataType_t>;
26 27

template <typename DeviceContext, typename T, size_t D>
H
hong 已提交
28
static void RemovePaddingSlice(const phi::GPUContext& context,
29 30 31
                               const Tensor* input, Tensor* out,
                               const std::vector<int>& starts,
                               const std::vector<int>& axes) {
H
hong 已提交
32
  auto& place = *context.eigen_device();
33 34
  auto in_dims = input->dims();
  auto new_out_dims = out->dims();
35 36
  auto offsets = Eigen::DSizes<Eigen::DenseIndex, D>();
  auto extents = Eigen::DSizes<Eigen::DenseIndex, D>();
37 38 39 40 41 42
  for (size_t i = 0; i < D; ++i) {
    offsets[i] = 0;
    extents[i] = new_out_dims[i];
  }

  for (size_t i = 0; i < axes.size(); ++i) {
43
    int start = starts[i];
44 45 46 47 48 49
    if (start < 0) {
      start = (start + in_dims[axes[i]]);
    }
    start = std::max(start, 0);
    offsets[axes[i]] = start;
  }
50

51 52 53 54 55 56
  auto in_t =
      framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
          *input);
  auto out_t =
      framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
          *out, new_out_dims);
57 58 59

  phi::funcs::EigenSlice<std::decay_t<decltype(place)>, T, D>::Eval(
      place, out_t, in_t, offsets, extents);
60 61
}

62 63
static inline double ToMegaBytes(size_t bytes) {
  return static_cast<double>(bytes) / (1 << 20);
64 65
}

66 67
static inline bool UseFixedWorkspace() {
  return FLAGS_conv_workspace_size_limit >= 0;
68 69
}

70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
static size_t CaclWorkspaceLimitInBytes(const phi::GPUContext& ctx) {
  if (!UseFixedWorkspace()) {
    int device_id = platform::GetCurrentDeviceId();
    int64_t allocated = memory::StatGetCurrentValue("Allocated", device_id);
    int64_t reserved = memory::StatGetCurrentValue("Reserved", device_id);
    int64_t availble = platform::GpuAvailableMemToAlloc();
    int64_t cur_workspace_size = ctx.cudnn_workspace_handle().WorkspaceSize();
    VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
            << " MB, reserved=" << ToMegaBytes(reserved)
            << " MB, available_to_alloc=" << ToMegaBytes(availble)
            << " MB, current_workspace_size=" << ToMegaBytes(cur_workspace_size)
            << " MB.";
    return std::max(std::max(availble, cur_workspace_size),
                    reserved - allocated);
  } else {
    return FLAGS_conv_workspace_size_limit * 1024 * 1024;
86 87 88
  }
}

89 90 91 92 93 94 95 96 97 98 99 100 101 102
template <typename PerfT>
std::string GetPerfResultString(std::string prefix,
                                const std::vector<PerfT>& perf_results,
                                int actual_algo_count, size_t workspace_limit) {
  std::ostringstream out;
  out << prefix << " (workspace limit=" << ToMegaBytes(workspace_limit)
      << " MB):\n";
  for (int i = 0; i < actual_algo_count; ++i) {
    const auto& result = perf_results[i];
    auto math_type_str = (result.mathType == CUDNN_TENSOR_OP_MATH) ? "T" : "F";
    out << "  algo=" << result.algo << ": tensor_core=" << math_type_str
        << ", time=" << result.time
        << " ms, memory=" << ToMegaBytes(result.memory)
        << " MB, status=" << result.status << "\n";
103
  }
104 105
  return out.str();
}
106

107 108 109 110 111 112
template <typename PerfT, typename AlgoT>
void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
                           size_t workspace_limit,
                           SearchResult<AlgoT>* algo_result) {
  for (size_t i = 0; i < perf_results.size(); ++i) {
    auto result = perf_results[i];
113
    if (result.status == CUDNN_STATUS_SUCCESS &&
114 115 116 117 118 119 120 121 122
        result.memory < workspace_limit) {
      algo_result->algo = result.algo;
      algo_result->time = result.time;
      algo_result->workspace_size = result.memory;
      VLOG(3) << "  algo=" << result.algo << ", time=" << result.time
              << " ms, memory=" << ToMegaBytes(result.memory)
              << " MB (limit=" << ToMegaBytes(workspace_limit)
              << " MB), status=" << result.status;
      return;
123 124
    }
  }
125 126
  VLOG(3) << "Can not find an algorithm that requires memory < "
          << ToMegaBytes(workspace_limit) << " MB";
127 128
}

H
hong 已提交
129
static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype,
130 131
                            const platform::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
132
  if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
133
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
134 135 136 137
        cdesc.desc(), CUDNN_TENSOR_OP_MATH));
    VLOG(5) << "use cudnn_tensor_op_math";
#if CUDA_VERSION >= 11000
#if CUDNN_VERSION_MIN(8, 1, 0)
138
  } else if (ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) {
139
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
140 141 142
        cdesc.desc(), CUDNN_TENSOR_OP_MATH));
#endif  // CUDNN_VERSION_MIN(8, 1, 0)
  } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) {
143
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
144 145 146
        cdesc.desc(), CUDNN_FMA_MATH));
#endif  // CUDA_VERSION >= 11000
  } else {
147
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
148 149 150 151 152 153
        cdesc.desc(), CUDNN_DEFAULT_MATH));
    VLOG(5) << "NOT use cudnn_tensor_op_math";
  }
#endif
}

Q
qingqing01 已提交
154 155
template <>
struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
156 157
  using PerfT = cudnnConvolutionFwdAlgoPerf_t;
  using AlgoT = cudnnConvolutionFwdAlgo_t;
Q
qingqing01 已提交
158 159

  template <typename T>
160 161 162 163
  static SearchResult<AlgoT> Find(const ConvArgs& args, bool exhaustive_search,
                                  bool deterministic,
                                  const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
Q
qingqing01 已提交
164
    auto dtype = platform::CudnnDataType<T>::type;
165
    size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx);
166
    SetConvMathType(ctx, dtype, args.cdesc);
167

168
    if (!exhaustive_search && !deterministic) {
169
#if CUDNN_VERSION >= 7001
170
      int actual_perf_count;
171
      int best_algo_idx = 0;
172
      std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
173
      PADDLE_ENFORCE_GPU_SUCCESS(
174 175 176
          platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
              args.handle, args.idesc.desc(), args.wdesc.desc(),
              args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS,
177 178 179
              &actual_perf_count, perf_results.data()));
      result.algo = perf_results[best_algo_idx].algo;
      result.workspace_size = perf_results[best_algo_idx].memory;
180

181
      if (result.workspace_size > workspace_size_limit) {
182
#if CUDNN_VERSION >= 8000
183
        // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8
184 185
        ChooseAlgoByWorkspace<PerfT, AlgoT>(perf_results, workspace_size_limit,
                                            &result);
186
#else
187 188 189
        VLOG(3) << "Fallback to non-v7 method to find conv algorithm "
                   "becasue the workspace size request("
                << result.workspace_size << ") exceeds the limit("
190
                << workspace_size_limit << ")";
191
        PADDLE_ENFORCE_GPU_SUCCESS(
192 193 194 195
            platform::dynload::cudnnGetConvolutionForwardAlgorithm(
                args.handle, args.idesc.desc(), args.wdesc.desc(),
                args.cdesc.desc(), args.odesc.desc(),
                CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
196
                workspace_size_limit, &(result.algo)));
197
#endif
198 199
      }
#else
200
      PADDLE_ENFORCE_GPU_SUCCESS(
201 202 203 204
          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
              args.handle, args.idesc.desc(), args.wdesc.desc(),
              args.cdesc.desc(), args.odesc.desc(),
              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
205
              workspace_size_limit, &(result.algo)));
206
#endif
207
    } else if (deterministic) {
208
      result.algo = static_cast<AlgoT>(1);
Q
qingqing01 已提交
209
    } else {
210
      auto workspace_handle = ctx.cudnn_workspace_handle();
211 212
      auto x_dims = phi::vectorize(args.x->dims());
      auto w_dims = phi::vectorize(args.w->dims());
213 214 215
      VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:"
               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
               << args.s << ", args.p" << args.p << ", args.d" << args.d;
216

217 218 219 220
      AlgorithmsCache<AlgoT>& algo_cache =
          *(framework::ConvSearchCache::Instance().GetForward());

      result.algo = algo_cache.GetAlgorithm(
221 222
          x_dims, w_dims, args.s, args.p, args.d, 0,
          static_cast<int64_t>(args.cudnn_dtype), [&]() {
Q
qingqing01 已提交
223
            int returned_algo_count;
224 225 226 227 228
            std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
            size_t max_workspace_size =
                FindMaxWorkspaceSize(args, workspace_size_limit);
            VLOG(4) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
                    << " MB";
Q
qingqing01 已提交
229 230

            auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
231
              PADDLE_ENFORCE_GPU_SUCCESS(
Q
qingqing01 已提交
232 233 234 235 236
                  platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
                      args.handle, args.idesc.desc(), args.x->data<T>(),
                      args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
                      args.odesc.desc(), const_cast<T*>(args.o->data<T>()),
                      kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
237 238
                      perf_results.data(), cudnn_workspace_ptr,
                      max_workspace_size));
Q
qingqing01 已提交
239
            };
240 241 242 243 244 245 246 247
            workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size,
                                         UseFixedWorkspace());

            VLOG(4) << GetPerfResultString<PerfT>(
                "[Exhaustive Search] FwdAlgo Perf result", perf_results,
                returned_algo_count, workspace_size_limit);
            result.time = perf_results[0].time;
            return perf_results[0].algo;
Q
qingqing01 已提交
248 249
          });
    }
250 251 252 253 254
    VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
            << ", deterministic=" << deterministic
            << ", choose algo=" << result.algo << ", workspace="
            << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB";
    return result;
Q
qingqing01 已提交
255 256
  }

257 258
  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionFwdAlgo_t algo) {
Q
qingqing01 已提交
259
    size_t workspace_size = 0;
260
    PADDLE_ENFORCE_GPU_SUCCESS(
261 262 263
        platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
            args.handle, args.idesc.desc(), args.wdesc.desc(),
            args.cdesc.desc(), args.odesc.desc(), algo, &workspace_size));
Q
qingqing01 已提交
264 265
    return workspace_size;
  }
266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287

 private:
  static size_t FindMaxWorkspaceSize(const ConvArgs& args,
                                     size_t workspace_size_limit) {
    if (!UseFixedWorkspace()) {
      size_t max_workspace_size = 0;
      for (size_t algo = 0; algo < kNUM_CUDNN_FWD_ALGS; ++algo) {
        size_t workspace_size = 0;
        auto status =
            platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
                args.handle, args.idesc.desc(), args.wdesc.desc(),
                args.cdesc.desc(), args.odesc.desc(),
                static_cast<cudnnConvolutionFwdAlgo_t>(algo), &workspace_size);
        if (status == CUDNN_STATUS_SUCCESS) {
          max_workspace_size = std::max(workspace_size, max_workspace_size);
        }
      }
      return std::min(max_workspace_size, workspace_size_limit);
    } else {
      return workspace_size_limit;
    }
  }
Q
qingqing01 已提交
288 289 290 291
};

template <>
struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
292 293
  using PerfT = cudnnConvolutionBwdDataAlgoPerf_t;
  using AlgoT = cudnnConvolutionBwdDataAlgo_t;
Q
qingqing01 已提交
294 295

  template <typename T>
296 297 298 299
  static SearchResult<AlgoT> Find(const ConvArgs& args, bool exhaustive_search,
                                  bool deterministic,
                                  const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
Q
qingqing01 已提交
300
    auto dtype = platform::CudnnDataType<T>::type;
301
    size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx);
302
    SetConvMathType(ctx, dtype, args.cdesc);
303

304
    if (!exhaustive_search && !deterministic) {
305
#if CUDNN_VERSION >= 7001
306
      int actual_perf_count;
307
      int best_algo_idx = 0;
308
      std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
309
      PADDLE_ENFORCE_GPU_SUCCESS(
310 311 312
          platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7(
              args.handle, args.wdesc.desc(), args.odesc.desc(),
              args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS,
313 314
              &actual_perf_count, perf_results.data()));
      result.algo = perf_results[best_algo_idx].algo;
315 316 317 318 319

#if CUDNN_VERSION < 7500
      int stride_dim = args.x->dims().size() - 2;
      bool blacklist = std::any_of(args.s.begin(), args.s.begin() + stride_dim,
                                   [=](int n) { return n != 1; });
320
      if (blacklist && (perf_results[best_algo_idx].algo ==
321
                            CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
322
                        perf_results[best_algo_idx].algo ==
323
                            CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
324
        result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
325 326
      }
#endif
327 328
      result.workspace_size = GetWorkspaceSize(args, result.algo);
      if (result.workspace_size > workspace_size_limit) {
329
#if CUDNN_VERSION >= 8000
330
        // cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8
331 332
        ChooseAlgoByWorkspace<PerfT, AlgoT>(perf_results, workspace_size_limit,
                                            &result);
333 334 335
#else
        VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
                   "the workspace size request("
336
                << result.workspace_size << ") exceeds the limit("
337
                << workspace_size_limit << ")";
338
        PADDLE_ENFORCE_GPU_SUCCESS(
339 340 341 342
            platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
                args.handle, args.wdesc.desc(), args.odesc.desc(),
                args.cdesc.desc(), args.idesc.desc(),
                CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
343
                workspace_size_limit, &(result.algo)));
344
#endif
345 346
      }
#else
347
      PADDLE_ENFORCE_GPU_SUCCESS(
348 349 350 351
          platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
              args.handle, args.wdesc.desc(), args.odesc.desc(),
              args.cdesc.desc(), args.idesc.desc(),
              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
352
              workspace_size_limit, &(result.algo)));
353
#endif
Q
qingqing01 已提交
354
    } else if (deterministic) {
355
      result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
Q
qingqing01 已提交
356
    } else {
357
      auto workspace_handle = ctx.cudnn_workspace_handle();
358 359
      auto x_dims = phi::vectorize(args.x->dims());
      auto w_dims = phi::vectorize(args.w->dims());
360 361 362
      VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t"
               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
               << args.s << ", args.p" << args.p << ", args.d" << args.d;
363

364 365 366
      AlgorithmsCache<AlgoT>& algo_cache =
          *(framework::ConvSearchCache::Instance().GetBackwardData());
      result.algo = algo_cache.GetAlgorithm(
367 368
          x_dims, w_dims, args.s, args.p, args.d, 0,
          static_cast<int64_t>(args.cudnn_dtype), [&]() {
Q
qingqing01 已提交
369
            int returned_algo_count;
370 371 372 373 374
            std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
            size_t max_workspace_size =
                FindMaxWorkspaceSize(args, workspace_size_limit);
            VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
                    << " MB";
Q
qingqing01 已提交
375 376

            auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
377
              PADDLE_ENFORCE_GPU_SUCCESS(
Q
qingqing01 已提交
378 379 380 381 382 383 384
                  platform::dynload::
                      cudnnFindConvolutionBackwardDataAlgorithmEx(
                          args.handle, args.wdesc.desc(), args.w->data<T>(),
                          args.odesc.desc(), args.o->data<T>(),
                          args.cdesc.desc(), args.idesc.desc(),
                          const_cast<T*>(args.x->data<T>()),
                          kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
385 386
                          perf_results.data(), cudnn_workspace_ptr,
                          max_workspace_size));
Q
qingqing01 已提交
387
            };
388 389 390 391 392 393 394 395
            workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size,
                                         UseFixedWorkspace());

            VLOG(3) << GetPerfResultString<PerfT>(
                "[Exhaustive Search] BwdDataAlgo Perf result", perf_results,
                returned_algo_count, workspace_size_limit);
            result.time = perf_results[0].time;
            return perf_results[0].algo;
Q
qingqing01 已提交
396 397
          });
    }
398 399 400 401 402
    VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
            << ", deterministic=" << deterministic
            << ", choose algo=" << result.algo << ", workspace="
            << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB";
    return result;
Q
qingqing01 已提交
403 404
  }

405 406
  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionBwdDataAlgo_t algo) {
Q
qingqing01 已提交
407
    size_t workspace_size = 0;
408
    PADDLE_ENFORCE_GPU_SUCCESS(
Q
qingqing01 已提交
409
        platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
410 411
            args.handle, args.wdesc.desc(), args.odesc.desc(),
            args.cdesc.desc(), args.idesc.desc(), algo, &workspace_size));
Q
qingqing01 已提交
412 413
    return workspace_size;
  }
414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436

 private:
  static size_t FindMaxWorkspaceSize(const ConvArgs& args,
                                     size_t workspace_size_limit) {
    if (!UseFixedWorkspace()) {
      size_t max_workspace_size = 0;
      for (size_t algo = 0; algo < kNUM_CUDNN_BWD_DATA_ALGS; ++algo) {
        size_t workspace_size = 0;
        auto status =
            platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
                args.handle, args.wdesc.desc(), args.odesc.desc(),
                args.cdesc.desc(), args.idesc.desc(),
                static_cast<cudnnConvolutionBwdDataAlgo_t>(algo),
                &workspace_size);
        if (status == CUDNN_STATUS_SUCCESS) {
          max_workspace_size = std::max(workspace_size, max_workspace_size);
        }
      }
      return std::min(max_workspace_size, workspace_size_limit);
    } else {
      return workspace_size_limit;
    }
  }
Q
qingqing01 已提交
437 438 439 440
};

template <>
struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
441 442
  using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t;
  using AlgoT = cudnnConvolutionBwdFilterAlgo_t;
Q
qingqing01 已提交
443 444

  template <typename T>
445 446 447
  static SearchResult<AlgoT> Find(const ConvArgs& args, bool exhaustive_search,
                                  bool deterministic,
                                  const phi::GPUContext& ctx) {
448
    platform::CUDAGraphCaptureModeGuard guard;
449
    SearchResult<AlgoT> result;
Q
qingqing01 已提交
450
    auto dtype = platform::CudnnDataType<T>::type;
451
    size_t workspace_size_limit = CaclWorkspaceLimitInBytes(ctx);
452
    SetConvMathType(ctx, dtype, args.cdesc);
Q
qingqing01 已提交
453

454
    if (!exhaustive_search && !deterministic) {
455
#if CUDNN_VERSION >= 7001
456
      int actual_perf_count;
457
      int best_algo_idx = 0;
458
      std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
459
      PADDLE_ENFORCE_GPU_SUCCESS(
460 461 462
          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7(
              args.handle, args.idesc.desc(), args.odesc.desc(),
              args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS,
463 464 465
              &actual_perf_count, perf_results.data()));
      result.algo = perf_results[best_algo_idx].algo;
      result.workspace_size = perf_results[best_algo_idx].memory;
466

467
      if (result.workspace_size > workspace_size_limit) {
468 469
#if CUDNN_VERSION >= 8000
        // cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8
470 471
        ChooseAlgoByWorkspace<PerfT, AlgoT>(perf_results, workspace_size_limit,
                                            &result);
472 473 474
#else
        VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
                   "the workspace size request("
475
                << result.workspace_size << ") exceeds the limit("
476
                << workspace_size_limit << ")";
477
        PADDLE_ENFORCE_GPU_SUCCESS(
478 479 480 481
            platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
                args.handle, args.idesc.desc(), args.odesc.desc(),
                args.cdesc.desc(), args.wdesc.desc(),
                CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
482
                workspace_size_limit, &(result.algo)));
483
#endif
484 485
      }
#else
486
      PADDLE_ENFORCE_GPU_SUCCESS(
Q
qingqing01 已提交
487 488 489 490
          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
              args.handle, args.idesc.desc(), args.odesc.desc(),
              args.cdesc.desc(), args.wdesc.desc(),
              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
491
              workspace_size_limit, &(result.algo)));
492
#endif
Q
qingqing01 已提交
493
    } else if (deterministic) {
494
      result.algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
Q
qingqing01 已提交
495
    } else {
496
      auto workspace_handle = ctx.cudnn_workspace_handle();
497 498
      auto x_dims = phi::vectorize(args.x->dims());
      auto w_dims = phi::vectorize(args.w->dims());
499 500 501
      VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:"
               << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
               << args.s << ", args.p" << args.p << ", args.d" << args.d;
502 503 504 505

      AlgorithmsCache<AlgoT>& algo_cache =
          *(framework::ConvSearchCache::Instance().GetBackwardFilter());

506
      if (dtype != CUDNN_DATA_HALF) {
507
        result.algo = algo_cache.GetAlgorithm(
508 509 510
            x_dims, w_dims, args.s, args.p, args.d, 0,
            static_cast<int64_t>(args.cudnn_dtype), [&]() {
              int returned_algo_count;
511 512 513 514 515 516
              std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
              size_t max_workspace_size =
                  FindMaxWorkspaceSize(args, workspace_size_limit);
              VLOG(3) << "max_workspace_size="
                      << ToMegaBytes(max_workspace_size) << " MB";

517
              auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
518
                PADDLE_ENFORCE_GPU_SUCCESS(
519 520 521 522 523 524 525
                    platform::dynload::
                        cudnnFindConvolutionBackwardFilterAlgorithmEx(
                            args.handle, args.idesc.desc(), args.x->data<T>(),
                            args.odesc.desc(), args.o->data<T>(),
                            args.cdesc.desc(), args.wdesc.desc(),
                            const_cast<T*>(args.w->data<T>()),
                            kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count,
526 527
                            perf_results.data(), cudnn_workspace_ptr,
                            max_workspace_size));
528
              };
529 530 531 532 533 534 535 536
              workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size,
                                           UseFixedWorkspace());

              VLOG(3) << GetPerfResultString<PerfT>(
                  "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results,
                  returned_algo_count, workspace_size_limit);
              result.time = perf_results[0].time;
              return perf_results[0].algo;
537 538
            });
      } else {
539
        result.algo = algo_cache.GetAlgorithm(
540 541
            x_dims, w_dims, args.s, args.p, args.d, 0,
            static_cast<int64_t>(args.cudnn_dtype), [&]() {
542
              SearchResult<AlgoT> algo_result;
543
              int actual_algos = 0;
544 545
              std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);

546
              PADDLE_ENFORCE_GPU_SUCCESS(
Q
qingqing01 已提交
547
                  platform::dynload::
548 549
                      cudnnFindConvolutionBackwardFilterAlgorithm(
                          args.handle, args.idesc.desc(), args.odesc.desc(),
Q
qingqing01 已提交
550
                          args.cdesc.desc(), args.wdesc.desc(),
551 552 553
                          perf_results.size(), &actual_algos,
                          perf_results.data()));
              perf_results.resize(actual_algos);
554 555 556
              ChooseAlgo(perf_results, workspace_size_limit, &algo_result);
              result.time = algo_result.time;
              return algo_result.algo;
557 558
            });
      }
Q
qingqing01 已提交
559
    }
560 561 562 563 564
    VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
            << ", deterministic=" << deterministic
            << ", choose algo=" << result.algo << ", workspace="
            << ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB";
    return result;
Q
qingqing01 已提交
565 566
  }

567 568
  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionBwdFilterAlgo_t algo) {
569
    platform::CUDAGraphCaptureModeGuard guard;
Q
qingqing01 已提交
570
    size_t workspace_size = 0;
571
    PADDLE_ENFORCE_GPU_SUCCESS(
Q
qingqing01 已提交
572 573 574 575 576
        platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
            args.handle, args.idesc.desc(), args.odesc.desc(),
            args.cdesc.desc(), args.wdesc.desc(), algo, &workspace_size));
    return workspace_size;
  }
577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639

 private:
  static size_t FindMaxWorkspaceSize(const ConvArgs& args,
                                     size_t workspace_size_limit) {
    if (!UseFixedWorkspace()) {
      size_t max_workspace_size = 0;
      for (size_t algo = 0; algo < kNUM_CUDNN_BWD_FILTER_ALGS; ++algo) {
        size_t workspace_size = 0;
        auto status =
            platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
                args.handle, args.idesc.desc(), args.odesc.desc(),
                args.cdesc.desc(), args.wdesc.desc(),
                static_cast<cudnnConvolutionBwdFilterAlgo_t>(algo),
                &workspace_size);
        if (status == CUDNN_STATUS_SUCCESS) {
          max_workspace_size = std::max(workspace_size, max_workspace_size);
        }
      }
      return std::min(max_workspace_size, workspace_size_limit);
    } else {
      return workspace_size_limit;
    }
  }

  static void ChooseAlgo(const std::vector<PerfT>& perf_results,
                         size_t workspace_limit,
                         SearchResult<AlgoT>* algo_result) {
    VLOG(3) << GetPerfResultString<PerfT>(
        "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results,
        perf_results.size(), workspace_limit);

    for (size_t i = 0; i != perf_results.size(); ++i) {
      const auto& result = perf_results[i];
      if (result.status == CUDNN_STATUS_SUCCESS &&
          (result.memory <= workspace_limit)) {
        if ((result.mathType == CUDNN_TENSOR_OP_MATH) &&
            (i != perf_results.size() - 1)) {
          const auto& next_result = perf_results[i + 1];
          if (next_result.status == CUDNN_STATUS_SUCCESS &&
              next_result.algo == result.algo &&
              next_result.memory == result.memory &&
              next_result.mathType != CUDNN_TENSOR_OP_MATH &&
              next_result.time < 1.01 * result.time) {
            // Skip over this result- it's not really a Tensor Core algo.
            // Because it is only 1% performance difference.
            // Prefer to choose the next equivalent non-Tensor Core algo.
            continue;
          }
        }
        algo_result->algo = result.algo;
        algo_result->time = result.time;
        auto math_type_str = "0";
        if (result.mathType == CUDNN_TENSOR_OP_MATH) {
          math_type_str = "1";
        }
        VLOG(3) << "    choose algo: " << result.algo
                << ", TC: " << math_type_str << ", time: " << result.time
                << " ms, wksp = " << result.memory
                << ", status = " << result.status;
        break;
      }
    }
  }
Q
qingqing01 已提交
640 641 642 643
};

}  // namespace operators
}  // namespace paddle