conv_cudnn_v7.h 30.0 KB
Newer Older
Q
qingqing01 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

17
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
18
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
19
#include "paddle/phi/kernels/autotune/switch_autotune.h"
20
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"
21

22
namespace phi {
Q
qingqing01 已提交
23

24
using ConvArgs = ConvArgsBase<cudnnHandle_t, cudnnDataType_t>;
25

26 27
// Converts a byte count into mebibytes (MiB) for logging purposes.
static inline double ToMegaBytes(size_t bytes) {
  constexpr double kBytesPerMegaByte = 1048576.0;  // 2^20
  return static_cast<double>(bytes) / kBytesPerMegaByte;
}

30 31
// Returns true when FLAGS_conv_workspace_size_limit pins the cuDNN workspace
// to a fixed size (any non-negative value, interpreted as MB elsewhere).
// A negative flag value means the limit is derived dynamically from the
// device's current memory headroom.
static inline bool UseFixedWorkspace() {
  const bool dynamic_workspace = FLAGS_conv_workspace_size_limit < 0;
  return !dynamic_workspace;
}

34 35
static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
  if (!use_fixed_workspace) {
36
    int device_id = phi::backends::gpu::GetCurrentDeviceId();
37
    int64_t allocated =
38
        paddle::memory::DeviceMemoryStatCurrentValue("Allocated", device_id);
39
    int64_t reserved =
40 41
        paddle::memory::DeviceMemoryStatCurrentValue("Reserved", device_id);
    int64_t availble = paddle::platform::GpuAvailableMemToAlloc();
42 43
    VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
            << " MB, reserved=" << ToMegaBytes(reserved)
44 45
            << " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB.";
    return std::max(availble, reserved - allocated);
46 47
  } else {
    return FLAGS_conv_workspace_size_limit * 1024 * 1024;
48 49 50
  }
}

51 52 53
// Renders perf_results[0..actual_algo_count) as a human-readable, multi-line
// report for VLOG output: one line per algorithm with its tensor-core flag
// ("T"/"F"), measured time (ms), workspace need (MB) and cuDNN status.
// `prefix` is taken by const reference to avoid copying the string on every
// call (all callers pass literals); this is call-compatible with the old
// by-value signature.
template <typename PerfT>
std::string GetPerfResultString(const std::string& prefix,
                                const std::vector<PerfT>& perf_results,
                                int actual_algo_count,
                                size_t workspace_limit) {
  std::ostringstream out;
  out << prefix << " (workspace limit=" << ToMegaBytes(workspace_limit)
      << " MB):\n";
  for (int i = 0; i < actual_algo_count; ++i) {
    const auto& result = perf_results[i];
    // mathType records whether cuDNN used Tensor Core math for this entry.
    auto math_type_str = (result.mathType == CUDNN_TENSOR_OP_MATH) ? "T" : "F";
    out << "  algo=" << result.algo << ": tensor_core=" << math_type_str
        << ", time=" << result.time
        << " ms, memory=" << ToMegaBytes(result.memory)
        << " MB, status=" << result.status << "\n";
  }
  return out.str();
}
69

70 71
// Choose the algorithm that has the minimal time cost while fitting into
// workspace_limit; on a near-tie (<1% slower), prefer the one needing less
// memory. On success the choice is written into *search_result; otherwise
// *search_result is left untouched.
// NOTE: perf_results is ordered by time (ascending).
template <typename PerfT, typename AlgoT>
void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
                           size_t workspace_limit,
                           SearchResult<AlgoT>* search_result) {
  int best_algo_idx = -1;
  for (size_t i = 0; i < perf_results.size(); ++i) {
    const auto& result = perf_results[i];
    // Only entries that both succeeded and fit the workspace budget qualify.
    if (result.status == CUDNN_STATUS_SUCCESS &&
        result.memory <= workspace_limit) {
      if (best_algo_idx == -1) {
        // The algorithm which has minimal time cost and needs a workspace_size
        // fitting the workspace_limit constraint.
        best_algo_idx = i;
        // Each perf_results[i].time is set to be -1 in heuristic search,
        // so timings are meaningless there — take the first feasible entry.
        if (perf_results[best_algo_idx].time < 0) {
          break;
        }
      } else {
        // Compared to the next suboptimal algorithm, if the best one only has
        // 1% performance difference, we'd like to pick the one which needs
        // less memory. Only the single next feasible candidate is considered
        // before the loop terminates.
        if (result.time < 1.01 * perf_results[best_algo_idx].time) {
          best_algo_idx = (result.memory < perf_results[best_algo_idx].memory)
                              ? i
                              : best_algo_idx;
          break;
        }
      }
    }
  }
  if (best_algo_idx != -1) {
    const auto& result = perf_results[best_algo_idx];
    search_result->algo = result.algo;
    search_result->time = result.time;
    search_result->workspace_size = result.memory;
    auto math_type_str = (result.mathType == CUDNN_TENSOR_OP_MATH) ? "T" : "F";
    VLOG(3) << "Choose algo=" << result.algo
            << ", tensor_core=" << math_type_str << ", time=" << result.time
            << " ms, memory=" << ToMegaBytes(result.memory)
            << " MB, status=" << result.status;
  } else {
    // No feasible algorithm: caller keeps whatever was in *search_result.
    VLOG(3) << "Can not find an algorithm that requires memory < "
            << ToMegaBytes(workspace_limit) << " MB";
  }
}

Y
Yiqun Liu 已提交
118 119
// Primary template: intentionally empty. Only the three explicit
// specializations below (forward, backward-data, backward-filter perf types)
// provide actual search implementations.
template <typename PerfT>
struct SearchAlgorithmBase {};
120

121 122 123 124
// cuDNN convolution forward algorithm searcher, consisting of three searching
// modes, namely: deterministic, heuristic and exhaustive_search mode.
// As well as one workspace size acquisition function with respect to
// the chosen algorithm.
template <>
struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
  using PerfT = cudnnConvolutionFwdAlgoPerf_t;
  using AlgoT = cudnnConvolutionFwdAlgo_t;
  // Cache bucket used by the auto-tune machinery for forward convolution.
  constexpr static phi::autotune::AlgorithmType kAlgoType =
      phi::autotune::AlgorithmType::kConvForward;

  static const std::string GetPerfName() { return "ConvForward"; }

  // Queries cuDNN for the workspace bytes required to run `algo` with the
  // descriptors held in `args`.
  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionFwdAlgo_t algo) {
    size_t workspace_size = 0;
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionForwardWorkspaceSize(args.handle,
                                                              args.idesc.desc(),
                                                              args.wdesc.desc(),
                                                              args.cdesc.desc(),
                                                              args.odesc.desc(),
                                                              algo,
                                                              &workspace_size));
    return workspace_size;
  }

 protected:
  // Deterministic mode: always pick algo 1 (presumably
  // CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM — confirm against the
  // cuDNN enum); time is reported as -1 since nothing was measured.
  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
    auto workspace_size = GetWorkspaceSize(args, static_cast<AlgoT>(1));
    return SearchResult<AlgoT>(static_cast<AlgoT>(1), -1.0, workspace_size);
  }

  // Heuristic search mode, calling the cudnnGetXxxAlgorithm.
  static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
                                               const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());

#if CUDNN_VERSION >= 7001
    int actual_perf_count;
    int best_algo_idx = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
            args.handle,
            args.idesc.desc(),
            args.wdesc.desc(),
            args.cdesc.desc(),
            args.odesc.desc(),
            kNUM_CUDNN_FWD_ALGS,
            &actual_perf_count,
            perf_results.data()));
    // v7 results are sorted; entry 0 is cuDNN's preferred algorithm.
    result.algo = perf_results[best_algo_idx].algo;
    result.workspace_size = perf_results[best_algo_idx].memory;

    // Preferred algorithm may need more workspace than allowed; fall back.
    if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
      VLOG(4) << GetPerfResultString<PerfT>("[Heuristic] FwdAlgo Perf result",
                                            perf_results,
                                            actual_perf_count,
                                            workspace_size_limit);
      // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
#else
      VLOG(3) << "Fallback to non-v7 method to find conv algorithm "
                 "becasue the workspace size request("
              << result.workspace_size << ") exceeds the limit("
              << workspace_size_limit << ")";
      PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnGetConvolutionForwardAlgorithm(
              args.handle,
              args.idesc.desc(),
              args.wdesc.desc(),
              args.cdesc.desc(),
              args.odesc.desc(),
              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
              workspace_size_limit,
              &(result.algo)));
#endif
    }
#else
    // Pre-7.0.1 cuDNN: only the legacy single-algorithm query exists.
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionForwardAlgorithm(
            args.handle,
            args.idesc.desc(),
            args.wdesc.desc(),
            args.cdesc.desc(),
            args.odesc.desc(),
            CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
            workspace_size_limit,
            &(result.algo)));
#endif
    // Re-query the exact workspace need of the finally chosen algorithm.
    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  // Exhaustive mode: benchmark all forward algorithms on the real tensors in
  // `args` and pick the best feasible one.
  template <typename T>
  static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
      const ConvArgs& args, const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
    size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit);
    VLOG(4) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
            << " MB";

    int returned_algo_count;
    std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
    auto cudnn_find_func = [&](void* workspace_ptr) {
      PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnFindConvolutionForwardAlgorithmEx(
              args.handle,
              args.idesc.desc(),
              args.x->data<T>(),
              args.wdesc.desc(),
              args.w->data<T>(),
              args.cdesc.desc(),
              args.odesc.desc(),
              const_cast<T*>(args.o->data<T>()),
              kNUM_CUDNN_FWD_ALGS,
              &returned_algo_count,
              perf_results.data(),
              workspace_ptr,
              max_workspace_size));
    };

    auto workspace_handle = ctx.cudnn_workspace_handle();
    workspace_handle.RunFuncSync(
        cudnn_find_func, max_workspace_size, UseFixedWorkspace());

    VLOG(4) << GetPerfResultString<PerfT>(
        "[Exhaustive Search] FwdAlgo Perf result",
        perf_results,
        returned_algo_count,
        workspace_size_limit);
    ChooseAlgoByWorkspace<PerfT, AlgoT>(
        perf_results, workspace_size_limit, &result);

    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  // Largest workspace any candidate algorithm (that fits the limit) would
  // need; used to size the benchmark scratch buffer in exhaustive search.
  static size_t GetMaxWorkspaceSize(const ConvArgs& args,
                                    size_t workspace_size_limit) {
    if (!UseFixedWorkspace()) {
      size_t max_workspace_size = 0;
      for (size_t algo = 0; algo < kNUM_CUDNN_FWD_ALGS; ++algo) {
        size_t workspace_size = 0;
        // Unsupported algorithms return a failure status; skip them silently.
        auto status = phi::dynload::cudnnGetConvolutionForwardWorkspaceSize(
            args.handle,
            args.idesc.desc(),
            args.wdesc.desc(),
            args.cdesc.desc(),
            args.odesc.desc(),
            static_cast<cudnnConvolutionFwdAlgo_t>(algo),
            &workspace_size);
        if (status == CUDNN_STATUS_SUCCESS &&
            workspace_size <= workspace_size_limit) {
          max_workspace_size = std::max(workspace_size, max_workspace_size);
        }
      }
      return max_workspace_size;
    } else {
      return workspace_size_limit;
    }
  }
};

292 293 294 295 296 297
// cuDNN convolution backward data-algorithm searcher, consisting of three
// searching modes, namely: deterministic, heuristic, and exhaustive_search
// mode. Specially, there are 2 patterns of exhaustive search mode, one for
// HALF precision only, one for the rest.
// As well as one workspace size acquisition function with
// respect to the chosen algorithm.
template <>
struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
  using PerfT = cudnnConvolutionBwdDataAlgoPerf_t;
  using AlgoT = cudnnConvolutionBwdDataAlgo_t;
  // Cache bucket used by the auto-tune machinery for backward-data.
  constexpr static phi::autotune::AlgorithmType kAlgoType =
      phi::autotune::AlgorithmType::kConvBackwardData;

  static const std::string GetPerfName() { return "ConvBackwardData"; }

  // Queries cuDNN for the workspace bytes required to run `algo`.
  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionBwdDataAlgo_t algo) {
    size_t workspace_size = 0;
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
            args.handle,
            args.wdesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.idesc.desc(),
            algo,
            &workspace_size));
    return workspace_size;
  }

 protected:
  // Deterministic mode: fixed choice of BWD_DATA_ALGO_1; time is -1 since
  // nothing was measured.
  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
    auto workspace_size =
        GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
    return SearchResult<AlgoT>(
        CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, -1.0, workspace_size);
  }

  // Heuristic search mode, calling the cudnnGetXxxAlgorithm_v7 query.
  static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
                                               const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());

#if CUDNN_VERSION >= 7001
    int actual_perf_count;
    int best_algo_idx = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7(
            args.handle,
            args.wdesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.idesc.desc(),
            kNUM_CUDNN_BWD_DATA_ALGS,
            &actual_perf_count,
            perf_results.data()));
    result.algo = perf_results[best_algo_idx].algo;

#if CUDNN_VERSION < 7500
    // Pre-7.5 cuDNN FFT-based backward-data algorithms are unreliable with
    // non-unit strides, so force ALGO_1 in that case.
    int stride_dim = args.x->dims().size() - 2;
    bool blacklist = std::any_of(args.s.begin(),
                                 args.s.begin() + stride_dim,
                                 [=](int n) { return n != 1; });
    if (blacklist && (perf_results[best_algo_idx].algo ==
                          CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
                      perf_results[best_algo_idx].algo ==
                          CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
      result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
    }
#endif
    result.workspace_size = GetWorkspaceSize(args, result.algo);
    // Preferred algorithm may need more workspace than allowed; fall back.
    if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
      // cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
#else
      VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
                 "the workspace size request("
              << result.workspace_size << ") exceeds the limit("
              << workspace_size_limit << ")";
      PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
              args.handle,
              args.wdesc.desc(),
              args.odesc.desc(),
              args.cdesc.desc(),
              args.idesc.desc(),
              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
              workspace_size_limit,
              &(result.algo)));
#endif
    }
#else
    // Pre-7.0.1 cuDNN: only the legacy single-algorithm query exists.
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
            args.handle,
            args.wdesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.idesc.desc(),
            CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
            workspace_size_limit,
            &(result.algo)));
#endif
    // Re-query the exact workspace need of the finally chosen algorithm.
    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  // Exhaustive mode: benchmark all backward-data algorithms on the real
  // tensors in `args` and pick the best feasible one.
  template <typename T>
  static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
      const ConvArgs& args, const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
    size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit);
    VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
            << " MB";

    int returned_algo_count;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
    auto cudnn_find_func = [&](void* workspace_ptr) {
      PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnFindConvolutionBackwardDataAlgorithmEx(
              args.handle,
              args.wdesc.desc(),
              args.w->data<T>(),
              args.odesc.desc(),
              args.o->data<T>(),
              args.cdesc.desc(),
              args.idesc.desc(),
              const_cast<T*>(args.x->data<T>()),
              kNUM_CUDNN_BWD_DATA_ALGS,
              &returned_algo_count,
              perf_results.data(),
              workspace_ptr,
              max_workspace_size));
    };

    auto workspace_handle = ctx.cudnn_workspace_handle();
    workspace_handle.RunFuncSync(
        cudnn_find_func, max_workspace_size, UseFixedWorkspace());

    VLOG(4) << GetPerfResultString<PerfT>(
        "[Exhaustive Search] BwdDataAlgo Perf result",
        perf_results,
        returned_algo_count,
        workspace_size_limit);
    ChooseAlgoByWorkspace<PerfT, AlgoT>(
        perf_results, workspace_size_limit, &result);

    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  // Largest workspace any candidate algorithm (that fits the limit) would
  // need; used to size the benchmark scratch buffer in exhaustive search.
  static size_t GetMaxWorkspaceSize(const ConvArgs& args,
                                    size_t workspace_size_limit) {
    if (!UseFixedWorkspace()) {
      size_t max_workspace_size = 0;
      for (size_t algo = 0; algo < kNUM_CUDNN_BWD_DATA_ALGS; ++algo) {
        size_t workspace_size = 0;
        // Unsupported algorithms return a failure status; skip them silently.
        auto status =
            phi::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
                args.handle,
                args.wdesc.desc(),
                args.odesc.desc(),
                args.cdesc.desc(),
                args.idesc.desc(),
                static_cast<cudnnConvolutionBwdDataAlgo_t>(algo),
                &workspace_size);
        if (status == CUDNN_STATUS_SUCCESS &&
            workspace_size <= workspace_size_limit) {
          max_workspace_size = std::max(workspace_size, max_workspace_size);
        }
      }
      return max_workspace_size;
    } else {
      return workspace_size_limit;
    }
  }
};

476 477 478 479
// cuDNN convolution backward filter-algorithm searcher, consisting of three
// algorithm searching modes, namely: deterministic, heuristic, and
// exhaustive_search mode. As well as one workspace size acquisition function
// with respect to the chosen algorithm.
template <>
struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
  using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t;
  using AlgoT = cudnnConvolutionBwdFilterAlgo_t;
  // Cache bucket used by the auto-tune machinery for backward-filter.
  constexpr static phi::autotune::AlgorithmType kAlgoType =
      phi::autotune::AlgorithmType::kConvBackwardFilter;

  static const std::string GetPerfName() { return "ConvBackwardFilter"; }

  // Queries cuDNN for the workspace bytes required to run `algo`. The guard
  // adjusts CUDA graph capture mode around the cuDNN query (see
  // CUDAGraphCaptureModeGuard for the exact behavior).
  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionBwdFilterAlgo_t algo) {
    paddle::platform::CUDAGraphCaptureModeGuard guard;
    size_t workspace_size = 0;
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
            args.handle,
            args.idesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.wdesc.desc(),
            algo,
            &workspace_size));
    return workspace_size;
  }

 protected:
  // Deterministic mode: fixed choice of BWD_FILTER_ALGO_1; time is -1 since
  // nothing was measured.
  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
    auto workspace_size =
        GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
    return SearchResult<AlgoT>(
        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, -1.0, workspace_size);
  }

  // Heuristic search mode, calling the cudnnGetXxxAlgorithm_v7 query.
  static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
                                               const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());

#if CUDNN_VERSION >= 7001
    int actual_perf_count;
    int best_algo_idx = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7(
            args.handle,
            args.idesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.wdesc.desc(),
            kNUM_CUDNN_BWD_FILTER_ALGS,
            &actual_perf_count,
            perf_results.data()));
    // v7 results are sorted; entry 0 is cuDNN's preferred algorithm.
    result.algo = perf_results[best_algo_idx].algo;
    result.workspace_size = perf_results[best_algo_idx].memory;

    // Preferred algorithm may need more workspace than allowed; fall back.
    if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
      // cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
#else
      VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
                 "the workspace size request("
              << result.workspace_size << ") exceeds the limit("
              << workspace_size_limit << ")";
      PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
              args.handle,
              args.idesc.desc(),
              args.odesc.desc(),
              args.cdesc.desc(),
              args.wdesc.desc(),
              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
              workspace_size_limit,
              &(result.algo)));
#endif
    }
#else
    // Pre-7.0.1 cuDNN: only the legacy single-algorithm query exists.
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
            args.handle,
            args.idesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.wdesc.desc(),
            CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
            workspace_size_limit,
            &(result.algo)));
#endif

    // Re-query the exact workspace need of the finally chosen algorithm.
    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  // Exhaustive mode. Two patterns: non-HALF types benchmark with the Ex API
  // on the real tensors; HALF uses the plain Find API (no user workspace).
  template <typename T>
  static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
      const ConvArgs& args, const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    int returned_algo_count = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
    auto workspace_handle = ctx.cudnn_workspace_handle();
    if (paddle::platform::CudnnDataType<T>::type != CUDNN_DATA_HALF) {
      size_t max_workspace_size =
          GetMaxWorkspaceSize(args, workspace_size_limit);
      VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
              << " MB";

      auto cudnn_find_func = [&](void* workspace_ptr) {
        PADDLE_ENFORCE_GPU_SUCCESS(
            phi::dynload::cudnnFindConvolutionBackwardFilterAlgorithmEx(
                args.handle,
                args.idesc.desc(),
                args.x->data<T>(),
                args.odesc.desc(),
                args.o->data<T>(),
                args.cdesc.desc(),
                args.wdesc.desc(),
                const_cast<T*>(args.w->data<T>()),
                kNUM_CUDNN_BWD_FILTER_ALGS,
                &returned_algo_count,
                perf_results.data(),
                workspace_ptr,
                max_workspace_size));
      };
      workspace_handle.RunFuncSync(
          cudnn_find_func, max_workspace_size, UseFixedWorkspace());

      VLOG(4) << GetPerfResultString<PerfT>(
          "[Exhaustive Search] BwdFilterAlgo Perf result",
          perf_results,
          returned_algo_count,
          workspace_size_limit);
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
    } else {
      int max_algos = GetAlgorithmMaxCount(args.handle);
      // NOTE: this local vector intentionally shadows the outer perf_results;
      // the HALF path works entirely on this sized-by-query copy.
      std::vector<PerfT> perf_results(max_algos);
      PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnFindConvolutionBackwardFilterAlgorithm(
              args.handle,
              args.idesc.desc(),
              args.odesc.desc(),
              args.cdesc.desc(),
              args.wdesc.desc(),
              perf_results.size(),
              &returned_algo_count,
              perf_results.data()));
      perf_results.resize(returned_algo_count);

      VLOG(4) << GetPerfResultString<PerfT>(
          "[Exhaustive Search] BwdFilterAlgo Perf result",
          perf_results,
          perf_results.size(),
          workspace_size_limit);
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
    }

    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  // Asks cuDNN (7.0.1+) how many backward-filter algorithms it can report;
  // falls back to the compile-time constant on failure or older versions.
  static int GetAlgorithmMaxCount(cudnnHandle_t handle) {
#if CUDNN_VERSION_MIN(7, 0, 1)
    int max_algos = 0;
    auto status =
        phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
            handle, &max_algos);
    if (status == gpuSuccess) {
      VLOG(5) << "[BackwardFilter] max_algos: predefined="
              << kNUM_CUDNN_BWD_FILTER_ALGS << ", actual=" << max_algos;
      return max_algos;
    }
#endif
    return kNUM_CUDNN_BWD_FILTER_ALGS;
  }

  // Largest workspace any candidate algorithm (that fits the limit) would
  // need; used to size the benchmark scratch buffer in exhaustive search.
  static size_t GetMaxWorkspaceSize(const ConvArgs& args,
                                    size_t workspace_size_limit) {
    if (!UseFixedWorkspace()) {
      size_t max_workspace_size = 0;
      for (size_t algo = 0; algo < kNUM_CUDNN_BWD_FILTER_ALGS; ++algo) {
        size_t workspace_size = 0;
        // Unsupported algorithms return a failure status; skip them silently.
        auto status =
            phi::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
                args.handle,
                args.idesc.desc(),
                args.odesc.desc(),
                args.cdesc.desc(),
                args.wdesc.desc(),
                static_cast<cudnnConvolutionBwdFilterAlgo_t>(algo),
                &workspace_size);
        if (status == CUDNN_STATUS_SUCCESS &&
            workspace_size <= workspace_size_limit) {
          max_workspace_size = std::max(workspace_size, max_workspace_size);
        }
      }
      return max_workspace_size;
    } else {
      return workspace_size_limit;
    }
  }
};

Y
Yiqun Liu 已提交
687 688 689 690 691
// Front-end searcher that layers caching and auto-tune policy on top of the
// three SearchAlgorithmBase specializations.
template <typename PerfT>
struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
  using AlgoT = typename SearchAlgorithmBase<PerfT>::AlgoT;

  // Picks a convolution algorithm for `args` according to the requested mode:
  //   - deterministic: fixed algorithm, no search;
  //   - exhaustive_search / active auto-tune: benchmark all algorithms and
  //     cache the winner;
  //   - otherwise: use the cached result if present, else heuristic search.
  // `enable_autotune=false` opts this call out of auto-tune (used by
  // conv2d_transpose, see comment below) while keeping the same signature.
  template <typename T>
  static SearchResult<AlgoT> Find(const phi::GPUContext& ctx,
                                  const ConvArgs& args,
                                  bool exhaustive_search,
                                  bool deterministic,
                                  bool enable_autotune = true) {
    SearchResult<AlgoT> result;
    bool use_autotune = false;
    auto dtype = paddle::platform::CudnnDataType<T>::type;
    SetConvMathType(ctx, dtype, args.cdesc);

    if (deterministic) {
      result = SearchAlgorithmBase<PerfT>::FindAlgoDeterministic(args);
    } else {
      // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
      // 2. Once turning on auto-tune, run heuristic (default) before
      //    auto-tune process, run exhaustive_search during mentioned process.
      //    Auto tune is only enabled between specified range.
      // 3. After auto-tune process, run cached algorithm if cached, run
      //    default mode for the rest.
      auto key = args.ConvertToConvCacheKey<T>();
      auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv(
          SearchAlgorithmBase<PerfT>::kAlgoType);
      bool find_in_cache = cache.Find(key);
      if (find_in_cache) {
        auto t = cache.Get(key);
        result.algo = static_cast<AlgoT>(t.algo);
        result.workspace_size = t.workspace_size;
        result.exhaustive_search = t.exhaustive_search;
      }
      // A cached exhaustive result is final; anything weaker may be upgraded.
      if (!result.exhaustive_search) {
        // In conv2d_transpose, enable_autotune is set to false because some
        // algorithms picked by exhaustive search method produce wrong results.
        use_autotune = enable_autotune &&
                       phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
        if (exhaustive_search || use_autotune) {
          // Once autotune is enabled, the autotuned result can rewrite the
          // previous result in cache found by heuristic method.
          result =
              SearchAlgorithmBase<PerfT>::template FindAlgoExhaustiveSearch<T>(
                  args, ctx);
          cache.Set(key,
                    phi::autotune::ConvAutoTuneResult(
                        static_cast<int64_t>(result.algo),
                        result.workspace_size,
                        true));
        } else if (!find_in_cache) {
          result = SearchAlgorithmBase<PerfT>::FindAlgoHeuristic(args, ctx);
          cache.Set(key,
                    phi::autotune::ConvAutoTuneResult(
                        static_cast<int64_t>(result.algo),
                        result.workspace_size,
                        false));
        }
      }
    }
    VLOG(3) << "[cuDNN " << SearchAlgorithmBase<PerfT>::GetPerfName()
            << "] exhaustive_search=" << exhaustive_search
            << ", use_autotune=" << use_autotune
            << ", deterministic=" << deterministic
            << ", choose algo=" << result.algo
            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
    return result;
  }

  // Configures the math mode on the convolution descriptor from the compute
  // capability and dtype: Tensor Core for FP16 (sm70+) and BF16 (sm80+),
  // FMA-only for FP32 when TF32 is disallowed, default math otherwise.
  // No-op when built against CUDA < 9.0 or cuDNN < 7.0.1.
  static void SetConvMathType(
      const phi::GPUContext& ctx,
      cudnnDataType_t dtype,
      const paddle::platform::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
    if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_TENSOR_OP_MATH));
      VLOG(5) << "Enable Tensor Core for FLOAT16";
#if CUDA_VERSION >= 11000
#if CUDNN_VERSION_MIN(8, 1, 0)
    } else if (ctx.GetComputeCapability() >= 80 &&
               dtype == CUDNN_DATA_BFLOAT16) {
      VLOG(5) << "Enable Tensor Core for BFLOAT16";
      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_TENSOR_OP_MATH));
#endif  // CUDNN_VERSION_MIN(8, 1, 0)
    } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) {
      VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT";
      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_FMA_MATH));
#endif  // CUDA_VERSION >= 11000
    } else {
      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_DEFAULT_MATH));
    }
#endif
  }
};

786
}  // namespace phi