/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"

namespace phi {

using ConvArgs = ConvArgsBase<cudnnHandle_t, cudnnDataType_t>;

static inline double ToMegaBytes(size_t bytes) {
  return static_cast<double>(bytes) / (1 << 20);
}

static inline bool UseFixedWorkspace() {
  return FLAGS_conv_workspace_size_limit >= 0;
}

static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
  if (!use_fixed_workspace) {
    int device_id = phi::backends::gpu::GetCurrentDeviceId();
    int64_t allocated =
        paddle::memory::DeviceMemoryStatCurrentValue("Allocated", device_id);
    int64_t reserved =
        paddle::memory::DeviceMemoryStatCurrentValue("Reserved", device_id);
    int64_t available = paddle::platform::GpuAvailableMemToAlloc();
    VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
            << " MB, reserved=" << ToMegaBytes(reserved)
            << " MB, available_to_alloc=" << ToMegaBytes(available) << " MB.";
    return std::max(available, reserved - allocated);
  } else {
    return FLAGS_conv_workspace_size_limit * 1024 * 1024;
  }
}
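
// Illustrative example of the two modes above (the flag value is
// hypothetical): if FLAGS_conv_workspace_size_limit is set to a non-negative
// value, say 512, the limit is fixed at 512 MB; if it is negative, the limit
// is derived from the current memory state of the device:
//
//   size_t limit = CalcWorkspaceLimitInBytes(UseFixedWorkspace());
//   // UseFixedWorkspace() == true  -> limit = 512 * 1024 * 1024 bytes
//   // UseFixedWorkspace() == false -> limit = max(available, reserved - allocated)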

template <typename PerfT>
std::string GetPerfResultString(std::string prefix,
                                const std::vector<PerfT>& perf_results,
                                int actual_algo_count,
                                size_t workspace_limit) {
  std::ostringstream out;
  out << prefix << " (workspace limit=" << ToMegaBytes(workspace_limit)
      << " MB):\n";
  for (int i = 0; i < actual_algo_count; ++i) {
    const auto& result = perf_results[i];
    auto math_type_str = (result.mathType == CUDNN_TENSOR_OP_MATH) ? "T" : "F";
    out << "  algo=" << result.algo << ": tensor_core=" << math_type_str
        << ", time=" << result.time
        << " ms, memory=" << ToMegaBytes(result.memory)
        << " MB, status=" << result.status << "\n";
  }
  return out.str();
}
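
// A sample of the string produced above (all values are made up for
// illustration):
//
//   [Heuristic] FwdAlgo Perf result (workspace limit=512 MB):
//     algo=1: tensor_core=T, time=0.05 ms, memory=32 MB, status=0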

// Choose the algorithm with the minimum time cost and, when times are close,
// the smaller memory requirement.
// NOTE: perf_results is ordered by time.
template <typename PerfT, typename AlgoT>
void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
                           size_t workspace_limit,
                           SearchResult<AlgoT>* search_result) {
  int best_algo_idx = -1;
  for (size_t i = 0; i < perf_results.size(); ++i) {
    auto result = perf_results[i];
    if (result.status == CUDNN_STATUS_SUCCESS &&
        result.memory < workspace_limit) {
      if (best_algo_idx == -1) {
        // The first algorithm that fits the workspace_limit constraint also
        // has the minimum time cost, since perf_results is ordered by time.
        best_algo_idx = i;
        // Each perf_results[i].time is set to be -1 in heuristic search.
        if (perf_results[best_algo_idx].time < 0) {
          break;
        }
      } else {
        float best_algo_time = perf_results[best_algo_idx].time;
        if ((result.time - best_algo_time) / best_algo_time < 0.01) {
          best_algo_idx = (result.memory < perf_results[best_algo_idx].memory)
                              ? i
                              : best_algo_idx;
          break;
        }
      }
    }
  }
  if (best_algo_idx != -1) {
    search_result->algo = perf_results[best_algo_idx].algo;
    search_result->time = perf_results[best_algo_idx].time;
    search_result->workspace_size = perf_results[best_algo_idx].memory;
  } else {
    VLOG(3) << "Cannot find an algorithm that requires memory < "
            << ToMegaBytes(workspace_limit) << " MB";
  }
}
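
// Illustrative walk-through of the selection rule above (made-up numbers),
// with workspace_limit = 64 MB and perf_results already sorted by time:
//   algo=A: time=1.00 ms, memory=80 MB -> skipped, exceeds the limit
//   algo=B: time=1.02 ms, memory=32 MB -> first feasible result, becomes best
//   algo=C: time=1.03 ms, memory=16 MB -> within 1% of B's time and needs
//                                         less memory, so it replaces B and
//                                         the loop stops
// The function then writes algo=C into search_result.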

template <typename PerfT>
struct SearchAlgorithmBase {};

// cuDNN convolution forward algorithm searcher, consisting of three searching
// modes, namely: deterministic, heuristic, and exhaustive_search mode.
// It also provides a workspace size acquisition function for the chosen
// algorithm.
template <>
struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
  using PerfT = cudnnConvolutionFwdAlgoPerf_t;
  using AlgoT = cudnnConvolutionFwdAlgo_t;
  constexpr static phi::autotune::AlgorithmType kAlgoType =
      phi::autotune::AlgorithmType::kConvForward;

  static const std::string GetPerfName() { return "ConvForward"; }

  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionFwdAlgo_t algo) {
    size_t workspace_size = 0;
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionForwardWorkspaceSize(args.handle,
                                                              args.idesc.desc(),
                                                              args.wdesc.desc(),
                                                              args.cdesc.desc(),
                                                              args.odesc.desc(),
                                                              algo,
                                                              &workspace_size));
    return workspace_size;
  }

 protected:
  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
    auto workspace_size = GetWorkspaceSize(args, static_cast<AlgoT>(1));
    return SearchResult<AlgoT>(static_cast<AlgoT>(1), -1.0, workspace_size);
  }

  // Heuristic search mode, calling the cudnnGetXxxAlgorithm.
  static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
                                               const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());

#if CUDNN_VERSION >= 7001
    int actual_perf_count;
    int best_algo_idx = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
            args.handle,
            args.idesc.desc(),
            args.wdesc.desc(),
            args.cdesc.desc(),
            args.odesc.desc(),
            kNUM_CUDNN_FWD_ALGS,
            &actual_perf_count,
            perf_results.data()));
    result.algo = perf_results[best_algo_idx].algo;
    result.workspace_size = perf_results[best_algo_idx].memory;

    if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
      VLOG(4) << GetPerfResultString<PerfT>("[Heuristic] FwdAlgo Perf result",
                                            perf_results,
                                            actual_perf_count,
                                            workspace_size_limit);
      // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
#else
      VLOG(3) << "Fallback to non-v7 method to find conv algorithm "
                 "because the workspace size request("
              << result.workspace_size << ") exceeds the limit("
              << workspace_size_limit << ")";
      PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnGetConvolutionForwardAlgorithm(
              args.handle,
              args.idesc.desc(),
              args.wdesc.desc(),
              args.cdesc.desc(),
              args.odesc.desc(),
              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
              workspace_size_limit,
              &(result.algo)));
#endif
    }
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionForwardAlgorithm(
            args.handle,
            args.idesc.desc(),
            args.wdesc.desc(),
            args.cdesc.desc(),
            args.odesc.desc(),
            CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
            workspace_size_limit,
            &(result.algo)));
#endif
    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  template <typename T>
  static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
      const ConvArgs& args, const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
    size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit);
    VLOG(4) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
            << " MB";

    int returned_algo_count;
    std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
    auto cudnn_find_func = [&](void* workspace_ptr) {
      PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnFindConvolutionForwardAlgorithmEx(
              args.handle,
              args.idesc.desc(),
              args.x->data<T>(),
              args.wdesc.desc(),
              args.w->data<T>(),
              args.cdesc.desc(),
              args.odesc.desc(),
              const_cast<T*>(args.o->data<T>()),
              kNUM_CUDNN_FWD_ALGS,
              &returned_algo_count,
              perf_results.data(),
              workspace_ptr,
              max_workspace_size));
    };

    auto workspace_handle = ctx.cudnn_workspace_handle();
    workspace_handle.RunFuncSync(
        cudnn_find_func, max_workspace_size, UseFixedWorkspace());

    VLOG(4) << GetPerfResultString<PerfT>(
        "[Exhaustive Search] FwdAlgo Perf result",
        perf_results,
        returned_algo_count,
        workspace_size_limit);
    ChooseAlgoByWorkspace<PerfT, AlgoT>(
        perf_results, workspace_size_limit, &result);

    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  static size_t GetMaxWorkspaceSize(const ConvArgs& args,
                                    size_t workspace_size_limit) {
    if (!UseFixedWorkspace()) {
      size_t max_workspace_size = 0;
      for (size_t algo = 0; algo < kNUM_CUDNN_FWD_ALGS; ++algo) {
        size_t workspace_size = 0;
        auto status = phi::dynload::cudnnGetConvolutionForwardWorkspaceSize(
            args.handle,
            args.idesc.desc(),
            args.wdesc.desc(),
            args.cdesc.desc(),
            args.odesc.desc(),
            static_cast<cudnnConvolutionFwdAlgo_t>(algo),
            &workspace_size);
        if (status == CUDNN_STATUS_SUCCESS &&
            workspace_size <= workspace_size_limit) {
          max_workspace_size = std::max(workspace_size, max_workspace_size);
        }
      }
      return max_workspace_size;
    } else {
      return workspace_size_limit;
    }
  }
};

// cuDNN convolution backward data-algorithm searcher, consisting of three
// searching modes, namely: deterministic, heuristic, and exhaustive_search
// mode. In particular, there are 2 patterns of exhaustive search mode, one
// for HALF precision only, one for the rest.
// It also provides a workspace size acquisition function for the chosen
// algorithm.
template <>
struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
  using PerfT = cudnnConvolutionBwdDataAlgoPerf_t;
  using AlgoT = cudnnConvolutionBwdDataAlgo_t;
  constexpr static phi::autotune::AlgorithmType kAlgoType =
      phi::autotune::AlgorithmType::kConvBackwardData;

  static const std::string GetPerfName() { return "ConvBackwardData"; }

  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionBwdDataAlgo_t algo) {
    size_t workspace_size = 0;
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
            args.handle,
            args.wdesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.idesc.desc(),
            algo,
            &workspace_size));
    return workspace_size;
  }

 protected:
  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
    auto workspace_size =
        GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
    return SearchResult<AlgoT>(
        CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, -1.0, workspace_size);
  }

  static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
                                               const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());

#if CUDNN_VERSION >= 7001
    int actual_perf_count;
    int best_algo_idx = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7(
            args.handle,
            args.wdesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.idesc.desc(),
            kNUM_CUDNN_BWD_DATA_ALGS,
            &actual_perf_count,
            perf_results.data()));
    result.algo = perf_results[best_algo_idx].algo;

#if CUDNN_VERSION < 7500
    int stride_dim = args.x->dims().size() - 2;
    bool blacklist = std::any_of(args.s.begin(),
                                 args.s.begin() + stride_dim,
                                 [=](int n) { return n != 1; });
    if (blacklist && (perf_results[best_algo_idx].algo ==
                          CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
                      perf_results[best_algo_idx].algo ==
                          CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
      result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
    }
#endif
    result.workspace_size = GetWorkspaceSize(args, result.algo);
    if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
      // cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
#else
      VLOG(1) << "Fallback to non-v7 method to find conv algorithm because "
                 "the workspace size request("
              << result.workspace_size << ") exceeds the limit("
              << workspace_size_limit << ")";
      PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
              args.handle,
              args.wdesc.desc(),
              args.odesc.desc(),
              args.cdesc.desc(),
              args.idesc.desc(),
              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
              workspace_size_limit,
              &(result.algo)));
#endif
    }
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
            args.handle,
            args.wdesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.idesc.desc(),
            CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
            workspace_size_limit,
            &(result.algo)));
#endif
    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  template <typename T>
  static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
      const ConvArgs& args, const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
    size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit);
    VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
            << " MB";

    int returned_algo_count;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
    auto cudnn_find_func = [&](void* workspace_ptr) {
      PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnFindConvolutionBackwardDataAlgorithmEx(
              args.handle,
              args.wdesc.desc(),
              args.w->data<T>(),
              args.odesc.desc(),
              args.o->data<T>(),
              args.cdesc.desc(),
              args.idesc.desc(),
              const_cast<T*>(args.x->data<T>()),
              kNUM_CUDNN_BWD_DATA_ALGS,
              &returned_algo_count,
              perf_results.data(),
              workspace_ptr,
              max_workspace_size));
    };

    auto workspace_handle = ctx.cudnn_workspace_handle();
    workspace_handle.RunFuncSync(
        cudnn_find_func, max_workspace_size, UseFixedWorkspace());

    VLOG(4) << GetPerfResultString<PerfT>(
        "[Exhaustive Search] BwdDataAlgo Perf result",
        perf_results,
        returned_algo_count,
        workspace_size_limit);
    ChooseAlgoByWorkspace<PerfT, AlgoT>(
        perf_results, workspace_size_limit, &result);

    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  static size_t GetMaxWorkspaceSize(const ConvArgs& args,
                                    size_t workspace_size_limit) {
    if (!UseFixedWorkspace()) {
      size_t max_workspace_size = 0;
      for (size_t algo = 0; algo < kNUM_CUDNN_BWD_DATA_ALGS; ++algo) {
        size_t workspace_size = 0;
        auto status =
            phi::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
                args.handle,
                args.wdesc.desc(),
                args.odesc.desc(),
                args.cdesc.desc(),
                args.idesc.desc(),
                static_cast<cudnnConvolutionBwdDataAlgo_t>(algo),
                &workspace_size);
        if (status == CUDNN_STATUS_SUCCESS &&
            workspace_size <= workspace_size_limit) {
          max_workspace_size = std::max(workspace_size, max_workspace_size);
        }
      }
      return max_workspace_size;
    } else {
      return workspace_size_limit;
    }
  }
};

// cuDNN convolution backward filter-algorithm searcher, consisting of three
// algorithm searching modes, namely: deterministic, heuristic, and
// exhaustive_search mode. It also provides a workspace size acquisition
// function for the chosen algorithm.
template <>
struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
  using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t;
  using AlgoT = cudnnConvolutionBwdFilterAlgo_t;
  constexpr static phi::autotune::AlgorithmType kAlgoType =
      phi::autotune::AlgorithmType::kConvBackwardFilter;

  static const std::string GetPerfName() { return "ConvBackwardFilter"; }

  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionBwdFilterAlgo_t algo) {
    paddle::platform::CUDAGraphCaptureModeGuard guard;
    size_t workspace_size = 0;
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
            args.handle,
            args.idesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.wdesc.desc(),
            algo,
            &workspace_size));
    return workspace_size;
  }

 protected:
  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
    auto workspace_size =
        GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
    return SearchResult<AlgoT>(
        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, -1.0, workspace_size);
  }

  static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
                                               const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());

#if CUDNN_VERSION >= 7001
    int actual_perf_count;
    int best_algo_idx = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7(
            args.handle,
            args.idesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.wdesc.desc(),
            kNUM_CUDNN_BWD_FILTER_ALGS,
            &actual_perf_count,
            perf_results.data()));
    result.algo = perf_results[best_algo_idx].algo;
    result.workspace_size = perf_results[best_algo_idx].memory;

    if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
      // cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
#else
      VLOG(1) << "Fallback to non-v7 method to find conv algorithm because "
                 "the workspace size request("
              << result.workspace_size << ") exceeds the limit("
              << workspace_size_limit << ")";
      PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
              args.handle,
              args.idesc.desc(),
              args.odesc.desc(),
              args.cdesc.desc(),
              args.wdesc.desc(),
              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
              workspace_size_limit,
              &(result.algo)));
#endif
    }
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
            args.handle,
            args.idesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.wdesc.desc(),
            CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
            workspace_size_limit,
            &(result.algo)));
#endif

    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  template <typename T>
  static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
      const ConvArgs& args, const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    int returned_algo_count = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
    auto workspace_handle = ctx.cudnn_workspace_handle();
    if (paddle::platform::CudnnDataType<T>::type != CUDNN_DATA_HALF) {
      size_t max_workspace_size =
          GetMaxWorkspaceSize(args, workspace_size_limit);
      VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
              << " MB";

      auto cudnn_find_func = [&](void* workspace_ptr) {
        PADDLE_ENFORCE_GPU_SUCCESS(
            phi::dynload::cudnnFindConvolutionBackwardFilterAlgorithmEx(
                args.handle,
                args.idesc.desc(),
                args.x->data<T>(),
                args.odesc.desc(),
                args.o->data<T>(),
                args.cdesc.desc(),
                args.wdesc.desc(),
                const_cast<T*>(args.w->data<T>()),
                kNUM_CUDNN_BWD_FILTER_ALGS,
                &returned_algo_count,
                perf_results.data(),
                workspace_ptr,
                max_workspace_size));
      };
      workspace_handle.RunFuncSync(
          cudnn_find_func, max_workspace_size, UseFixedWorkspace());

      VLOG(4) << GetPerfResultString<PerfT>(
          "[Exhaustive Search] BwdFilterAlgo Perf result",
          perf_results,
          returned_algo_count,
          workspace_size_limit);
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
    } else {
      int max_algos = GetAlgorithmMaxCount(args.handle);
      std::vector<PerfT> perf_results(max_algos);
      PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnFindConvolutionBackwardFilterAlgorithm(
              args.handle,
              args.idesc.desc(),
              args.odesc.desc(),
              args.cdesc.desc(),
              args.wdesc.desc(),
              perf_results.size(),
              &returned_algo_count,
              perf_results.data()));
      perf_results.resize(returned_algo_count);

      VLOG(4) << GetPerfResultString<PerfT>(
          "[Exhaustive Search] BwdFilterAlgo Perf result",
          perf_results,
          perf_results.size(),
          workspace_size_limit);
      ChooseAlgo(perf_results, workspace_size_limit, &result);
    }

    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  static int GetAlgorithmMaxCount(cudnnHandle_t handle) {
#if CUDNN_VERSION_MIN(7, 0, 1)
    int max_algos = 0;
    auto status =
        phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
            handle, &max_algos);
    if (status == gpuSuccess) {
      VLOG(5) << "[BackwardFilter] max_algos: predefined="
              << kNUM_CUDNN_BWD_FILTER_ALGS << ", actual=" << max_algos;
      return max_algos;
    }
#endif
    return kNUM_CUDNN_BWD_FILTER_ALGS;
  }

  static size_t GetMaxWorkspaceSize(const ConvArgs& args,
                                    size_t workspace_size_limit) {
    if (!UseFixedWorkspace()) {
      size_t max_workspace_size = 0;
      for (size_t algo = 0; algo < kNUM_CUDNN_BWD_FILTER_ALGS; ++algo) {
        size_t workspace_size = 0;
        auto status =
            phi::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
                args.handle,
                args.idesc.desc(),
                args.odesc.desc(),
                args.cdesc.desc(),
                args.wdesc.desc(),
                static_cast<cudnnConvolutionBwdFilterAlgo_t>(algo),
                &workspace_size);
        if (status == CUDNN_STATUS_SUCCESS &&
            workspace_size <= workspace_size_limit) {
          max_workspace_size = std::max(workspace_size, max_workspace_size);
        }
      }
      return max_workspace_size;
    } else {
      return workspace_size_limit;
    }
  }

  static void ChooseAlgo(const std::vector<PerfT>& perf_results,
                         size_t workspace_limit,
                         SearchResult<AlgoT>* algo_result) {
    for (size_t i = 0; i != perf_results.size(); ++i) {
      const auto& result = perf_results[i];
      if (result.status == CUDNN_STATUS_SUCCESS &&
          (result.memory <= workspace_limit)) {
        if ((result.mathType == CUDNN_TENSOR_OP_MATH) &&
            (i != perf_results.size() - 1)) {
          const auto& next_result = perf_results[i + 1];
          if (next_result.status == CUDNN_STATUS_SUCCESS &&
              next_result.algo == result.algo &&
              next_result.memory == result.memory &&
              next_result.mathType != CUDNN_TENSOR_OP_MATH &&
              next_result.time < 1.01 * result.time) {
            // Skip this result: it is reported as a Tensor Core algo, but the
            // equivalent non-Tensor Core result that follows is less than 1%
            // slower, so prefer that non-Tensor Core algo instead.
            continue;
          }
        }
        algo_result->algo = result.algo;
        algo_result->time = result.time;
        auto math_type_str = "0";
        if (result.mathType == CUDNN_TENSOR_OP_MATH) {
          math_type_str = "1";
        }
        VLOG(3) << "    choose algo: " << result.algo
                << ", TC: " << math_type_str << ", time: " << result.time
                << " ms, wksp = " << result.memory
                << ", status = " << result.status;
        break;
      }
    }
  }
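
  // Illustrative case for the Tensor Core skip in ChooseAlgo (made-up
  // numbers):
  //   perf_results[i]   : algo=1, mathType=TENSOR_OP, memory=8 MB, time=0.1000 ms
  //   perf_results[i+1] : algo=1, mathType=DEFAULT,   memory=8 MB, time=0.1005 ms
  // The non-Tensor Core entry is the same algorithm with the same memory and
  // is less than 1% slower, so the Tensor Core entry is skipped and the
  // non-Tensor Core entry is chosen on the next iteration.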
};

template <typename PerfT>
struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
  using AlgoT = typename SearchAlgorithmBase<PerfT>::AlgoT;

  template <typename T>
  static SearchResult<AlgoT> Find(const phi::GPUContext& ctx,
                                  const ConvArgs& args,
                                  bool exhaustive_search,
                                  bool deterministic,
                                  bool enable_autotune = true) {
    SearchResult<AlgoT> result;
    bool use_autotune = false;
    auto dtype = paddle::platform::CudnnDataType<T>::type;
    SetConvMathType(ctx, dtype, args.cdesc);

    if (deterministic) {
      result = SearchAlgorithmBase<PerfT>::FindAlgoDeterministic(args);
    } else {
      // 1. Once the exhaustive_search FLAG is turned on, always run
      //    exhaustive search.
      // 2. Once auto-tune is turned on, run the heuristic (default) search
      //    before the auto-tune process, and run exhaustive search during
      //    that process. Auto-tune is only enabled within the specified
      //    range.
      // 3. After the auto-tune process, run the cached algorithm if one is
      //    cached, and the default mode otherwise.
      auto key = args.Convert2ConvCacheKey<T>();
      auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv(
          SearchAlgorithmBase<PerfT>::kAlgoType);
      bool find_in_cache = cache.Find(key);
      if (find_in_cache) {
        auto t = cache.Get(key);
        result.algo = static_cast<AlgoT>(t.algo);
        result.workspace_size = t.workspace_size;
        result.exhaustive_search = t.exhaustive_search;
      }
      if (!result.exhaustive_search) {
        bool need_update_cache = false;
        // In conv2d_transpose, enable_autotune is set to false because some
        // algorithms picked by the exhaustive search method produce wrong
        // results.
        use_autotune = enable_autotune &&
                       phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
        if (exhaustive_search || use_autotune) {
          // Once autotune is enabled, the autotuned result can overwrite the
          // previous result found by the heuristic method in the cache.
          result =
              SearchAlgorithmBase<PerfT>::template FindAlgoExhaustiveSearch<T>(
                  args, ctx);
          need_update_cache = true;
        } else if (!find_in_cache) {
          result = SearchAlgorithmBase<PerfT>::FindAlgoHeuristic(args, ctx);
          need_update_cache = true;
        }
        if (need_update_cache) {
          phi::autotune::ConvAutoTuneResult node(
              static_cast<int64_t>(result.algo),
              result.workspace_size,
              exhaustive_search || use_autotune);
          cache.Set(key, node);
        }
      }
    }
    VLOG(3) << "[cuDNN " << SearchAlgorithmBase<PerfT>::GetPerfName()
            << "] exhaustive_search=" << exhaustive_search
            << ", use_autotune=" << use_autotune
            << ", deterministic=" << deterministic
            << ", choose algo=" << result.algo
            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
    return result;
  }

  static void SetConvMathType(
      const phi::GPUContext& ctx,
      cudnnDataType_t dtype,
      const paddle::platform::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
    if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_TENSOR_OP_MATH));
      VLOG(5) << "Enable Tensor Core for FLOAT16";
#if CUDA_VERSION >= 11000
#if CUDNN_VERSION_MIN(8, 1, 0)
    } else if (ctx.GetComputeCapability() >= 80 &&
               dtype == CUDNN_DATA_BFLOAT16) {
      VLOG(5) << "Enable Tensor Core for BFLOAT16";
      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_TENSOR_OP_MATH));
#endif  // CUDNN_VERSION_MIN(8, 1, 0)
    } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) {
      VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT";
      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_FMA_MATH));
#endif  // CUDA_VERSION >= 11000
    } else {
      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_DEFAULT_MATH));
    }
#endif
  }
};
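
// A minimal usage sketch (hypothetical caller code; `ctx`, `args`, and the
// element type `T` are assumed to be prepared by the convolution kernel that
// includes this header):
//
//   using Search = SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
//   SearchResult<cudnnConvolutionFwdAlgo_t> sr = Search::Find<T>(
//       ctx, args, /*exhaustive_search=*/false, /*deterministic=*/false);
//   // sr.algo is then passed to the cuDNN convolution call, together with a
//   // workspace of at least sr.workspace_size bytes obtained from
//   // ctx.cudnn_workspace_handle().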

}  // namespace phi