conv_cudnn_v7.h 34.1 KB
Newer Older
Q
qingqing01 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

17
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
18
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
19
#include "paddle/phi/kernels/autotune/switch_autotune.h"
20
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"
21

22
namespace phi {
Q
qingqing01 已提交
23

24
using ConvArgs = ConvArgsBase<cudnnHandle_t, cudnnDataType_t>;
25

26 27
// Converts a byte count into mebibytes for human-readable log output.
static inline double ToMegaBytes(size_t bytes) {
  constexpr double kBytesPerMegabyte = static_cast<double>(1 << 20);
  return static_cast<double>(bytes) / kBytesPerMegabyte;
}

30 31
// A non-negative FLAGS_conv_workspace_size_limit means the user pinned the
// cuDNN workspace to a fixed size; a negative value lets the searcher size
// the workspace from currently available device memory.
static inline bool UseFixedWorkspace() {
  return FLAGS_conv_workspace_size_limit >= 0;
}

34 35
static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
  if (!use_fixed_workspace) {
36
    int device_id = phi::backends::gpu::GetCurrentDeviceId();
37
    int64_t allocated =
38
        paddle::memory::DeviceMemoryStatCurrentValue("Allocated", device_id);
39
    int64_t reserved =
40 41
        paddle::memory::DeviceMemoryStatCurrentValue("Reserved", device_id);
    int64_t availble = paddle::platform::GpuAvailableMemToAlloc();
42 43
    VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
            << " MB, reserved=" << ToMegaBytes(reserved)
44 45
            << " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB.";
    return std::max(availble, reserved - allocated);
46 47
  } else {
    return FLAGS_conv_workspace_size_limit * 1024 * 1024;
48 49 50
  }
}

51 52 53
// Renders the first |actual_algo_count| entries of |perf_results| as a
// multi-line human-readable string (one algorithm per line) for VLOG output.
template <typename PerfT>
std::string GetPerfResultString(std::string prefix,
                                const std::vector<PerfT>& perf_results,
                                int actual_algo_count,
                                size_t workspace_limit) {
  std::ostringstream out;
  out << prefix << " (workspace limit=" << ToMegaBytes(workspace_limit)
      << " MB):\n";
  for (int idx = 0; idx < actual_algo_count; ++idx) {
    const auto& perf = perf_results[idx];
    const char* tensor_core_flag =
        (perf.mathType == CUDNN_TENSOR_OP_MATH) ? "T" : "F";
    out << "  algo=" << perf.algo << ": tensor_core=" << tensor_core_flag
        << ", time=" << perf.time
        << " ms, memory=" << ToMegaBytes(perf.memory)
        << " MB, status=" << perf.status << "\n";
  }
  return out.str();
}
69

70 71
// Choose an algorithm which has the minimize time cost and less memory.
// NOTE: perf_results is ordered by time.
72 73 74
template <typename PerfT, typename AlgoT>
void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
                           size_t workspace_limit,
75 76
                           SearchResult<AlgoT>* search_result) {
  int best_algo_idx = -1;
77
  for (size_t i = 0; i < perf_results.size(); ++i) {
78
    const auto& result = perf_results[i];
79
    if (result.status == CUDNN_STATUS_SUCCESS &&
80
        result.memory <= workspace_limit) {
81 82 83 84 85 86 87 88 89
      if (best_algo_idx == -1) {
        // The algorithm which has minimize time cost and need a workspace_size
        // fitting the workspace_limit constraint.
        best_algo_idx = i;
        // Each perf_results[i].time is set to be -1 in heuristic search.
        if (perf_results[best_algo_idx].time < 0) {
          break;
        }
      } else {
90 91 92 93
        // Compared to the next suboptimal algorithm, if the best one only has
        // 1% performance difference, we'd like to pick the one which need less
        // memory.
        if (result.time < 1.01 * perf_results[best_algo_idx].time) {
94 95 96 97 98 99
          best_algo_idx = (result.memory < perf_results[best_algo_idx].memory)
                              ? i
                              : best_algo_idx;
          break;
        }
      }
100 101
    }
  }
102
  if (best_algo_idx != -1) {
103 104 105 106 107 108 109 110 111
    const auto& result = perf_results[best_algo_idx];
    search_result->algo = result.algo;
    search_result->time = result.time;
    search_result->workspace_size = result.memory;
    auto math_type_str = (result.mathType == CUDNN_TENSOR_OP_MATH) ? "T" : "F";
    VLOG(3) << "Choose algo=" << result.algo
            << ", tensor_core=" << math_type_str << ", time=" << result.time
            << " ms, memory=" << ToMegaBytes(result.memory)
            << " MB, status=" << result.status;
112 113 114 115
  } else {
    VLOG(3) << "Can not find an algorithm that requires memory < "
            << ToMegaBytes(workspace_limit) << " MB";
  }
116 117
}

118
// Primary template; intentionally empty. Each convolution kind (forward,
// backward-data, backward-filter) provides a full specialization below.
template <ConvKind CK>
struct SearchAlgorithmBase {};
120

121 122 123 124
// cuDNN convolution forward algorithm searcher, consisted of three searching
// modes, namely: deterministic, heuristic and exhaustive_search mode.
// As well as one workspace size acquirsition function with respect to
// the chosen alogrithm.
Q
qingqing01 已提交
125
template <>
126
struct SearchAlgorithmBase<ConvKind::kForward> {
127 128
  using PerfT = cudnnConvolutionFwdAlgoPerf_t;
  using AlgoT = cudnnConvolutionFwdAlgo_t;
129

Y
Yiqun Liu 已提交
130 131
  constexpr static phi::autotune::AlgorithmType kAlgoType =
      phi::autotune::AlgorithmType::kConvForward;
Q
qingqing01 已提交
132

133 134
  static const std::string GetPerfName() { return "ConvForward"; }

135 136
  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionFwdAlgo_t algo) {
Q
qingqing01 已提交
137
    size_t workspace_size = 0;
138
    PADDLE_ENFORCE_GPU_SUCCESS(
139 140 141 142 143 144 145
        phi::dynload::cudnnGetConvolutionForwardWorkspaceSize(args.handle,
                                                              args.idesc.desc(),
                                                              args.wdesc.desc(),
                                                              args.cdesc.desc(),
                                                              args.odesc.desc(),
                                                              algo,
                                                              &workspace_size));
Q
qingqing01 已提交
146 147
    return workspace_size;
  }
148

Y
Yiqun Liu 已提交
149
 protected:
H
hong 已提交
150 151 152
  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
    auto workspace_size = GetWorkspaceSize(args, static_cast<AlgoT>(1));
    return SearchResult<AlgoT>(static_cast<AlgoT>(1), -1.0, workspace_size);
153 154 155 156 157 158 159 160 161 162 163 164 165 166
  }

  // Heuristic search mode, calling the cudnnGetXxxAlgorithm.
  static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
                                               const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());

#if CUDNN_VERSION >= 7001
    int actual_perf_count;
    int best_algo_idx = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
    PADDLE_ENFORCE_GPU_SUCCESS(
167
        phi::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
168 169 170 171 172 173 174 175
            args.handle,
            args.idesc.desc(),
            args.wdesc.desc(),
            args.cdesc.desc(),
            args.odesc.desc(),
            kNUM_CUDNN_FWD_ALGS,
            &actual_perf_count,
            perf_results.data()));
176 177 178 179 180
    result.algo = perf_results[best_algo_idx].algo;
    result.workspace_size = perf_results[best_algo_idx].memory;

    if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
Y
Yiqun Liu 已提交
181 182 183 184
      VLOG(4) << GetPerfResultString<PerfT>("[Heuristic] FwdAlgo Perf result",
                                            perf_results,
                                            actual_perf_count,
                                            workspace_size_limit);
185
      // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8
186 187
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
188 189 190 191 192 193
#else
      VLOG(3) << "Fallback to non-v7 method to find conv algorithm "
                 "becasue the workspace size request("
              << result.workspace_size << ") exceeds the limit("
              << workspace_size_limit << ")";
      PADDLE_ENFORCE_GPU_SUCCESS(
194
          phi::dynload::cudnnGetConvolutionForwardAlgorithm(
195 196 197 198 199
              args.handle,
              args.idesc.desc(),
              args.wdesc.desc(),
              args.cdesc.desc(),
              args.odesc.desc(),
200
              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
201 202
              workspace_size_limit,
              &(result.algo)));
203 204 205 206
#endif
    }
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
207
        phi::dynload::cudnnGetConvolutionForwardAlgorithm(
208 209 210 211 212 213 214
            args.handle,
            args.idesc.desc(),
            args.wdesc.desc(),
            args.cdesc.desc(),
            args.odesc.desc(),
            CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
            workspace_size_limit,
215 216
            &(result.algo)));
#endif
H
hong 已提交
217
    result.workspace_size = GetWorkspaceSize(args, result.algo);
218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
    return result;
  }

  template <typename T>
  static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
      const ConvArgs& args, const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
    size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit);
    VLOG(4) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
            << " MB";

    int returned_algo_count;
    std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
    auto cudnn_find_func = [&](void* workspace_ptr) {
      PADDLE_ENFORCE_GPU_SUCCESS(
235
          phi::dynload::cudnnFindConvolutionForwardAlgorithmEx(
236 237 238 239 240 241 242 243 244 245 246 247 248
              args.handle,
              args.idesc.desc(),
              args.x->data<T>(),
              args.wdesc.desc(),
              args.w->data<T>(),
              args.cdesc.desc(),
              args.odesc.desc(),
              const_cast<T*>(args.o->data<T>()),
              kNUM_CUDNN_FWD_ALGS,
              &returned_algo_count,
              perf_results.data(),
              workspace_ptr,
              max_workspace_size));
249 250 251
    };

    auto workspace_handle = ctx.cudnn_workspace_handle();
252 253
    workspace_handle.RunFuncSync(
        cudnn_find_func, max_workspace_size, UseFixedWorkspace());
254 255

    VLOG(4) << GetPerfResultString<PerfT>(
256 257 258 259 260 261
        "[Exhaustive Search] FwdAlgo Perf result",
        perf_results,
        returned_algo_count,
        workspace_size_limit);
    ChooseAlgoByWorkspace<PerfT, AlgoT>(
        perf_results, workspace_size_limit, &result);
262

H
hong 已提交
263
    result.workspace_size = GetWorkspaceSize(args, result.algo);
264 265 266 267 268
    return result;
  }

  static size_t GetMaxWorkspaceSize(const ConvArgs& args,
                                    size_t workspace_size_limit) {
269 270 271 272
    if (!UseFixedWorkspace()) {
      size_t max_workspace_size = 0;
      for (size_t algo = 0; algo < kNUM_CUDNN_FWD_ALGS; ++algo) {
        size_t workspace_size = 0;
273 274 275 276 277 278 279 280
        auto status = phi::dynload::cudnnGetConvolutionForwardWorkspaceSize(
            args.handle,
            args.idesc.desc(),
            args.wdesc.desc(),
            args.cdesc.desc(),
            args.odesc.desc(),
            static_cast<cudnnConvolutionFwdAlgo_t>(algo),
            &workspace_size);
281 282
        if (status == CUDNN_STATUS_SUCCESS &&
            workspace_size <= workspace_size_limit) {
283 284 285
          max_workspace_size = std::max(workspace_size, max_workspace_size);
        }
      }
286
      return max_workspace_size;
287 288 289 290
    } else {
      return workspace_size_limit;
    }
  }
Q
qingqing01 已提交
291 292
};

293 294 295 296 297 298
// cuDNN convolution backward data-algorithm searcher, consisting of three
// searching modes, namely: deterministic, heuristic, and exhaustive_search
// mode. Specially, there are 2 pattens of exhaustive search mode, one for
// HALF precision only, one for the rest.
// As well as one workspace size acquirsition function with
// respect to the chosen alogrithm.
Q
qingqing01 已提交
299
template <>
300
struct SearchAlgorithmBase<ConvKind::kBackwardData> {
301 302
  using PerfT = cudnnConvolutionBwdDataAlgoPerf_t;
  using AlgoT = cudnnConvolutionBwdDataAlgo_t;
303

Y
Yiqun Liu 已提交
304 305
  constexpr static phi::autotune::AlgorithmType kAlgoType =
      phi::autotune::AlgorithmType::kConvBackwardData;
Q
qingqing01 已提交
306

307 308
  static const std::string GetPerfName() { return "ConvBackwardData"; }

309 310
  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionBwdDataAlgo_t algo) {
Q
qingqing01 已提交
311
    size_t workspace_size = 0;
312
    PADDLE_ENFORCE_GPU_SUCCESS(
313
        phi::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
314 315 316 317 318 319 320
            args.handle,
            args.wdesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.idesc.desc(),
            algo,
            &workspace_size));
Q
qingqing01 已提交
321 322
    return workspace_size;
  }
323

Y
Yiqun Liu 已提交
324
 protected:
H
hong 已提交
325 326 327 328 329
  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
    auto workspace_size =
        GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
    return SearchResult<AlgoT>(
        CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, -1.0, workspace_size);
330 331 332 333 334 335 336 337 338 339 340 341 342
  }

  static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
                                               const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());

#if CUDNN_VERSION >= 7001
    int actual_perf_count;
    int best_algo_idx = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
    PADDLE_ENFORCE_GPU_SUCCESS(
343
        phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7(
344 345 346 347 348 349 350 351
            args.handle,
            args.wdesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.idesc.desc(),
            kNUM_CUDNN_BWD_DATA_ALGS,
            &actual_perf_count,
            perf_results.data()));
352 353 354 355
    result.algo = perf_results[best_algo_idx].algo;

#if CUDNN_VERSION < 7500
    int stride_dim = args.x->dims().size() - 2;
356 357
    bool blacklist = std::any_of(args.s.begin(),
                                 args.s.begin() + stride_dim,
358 359 360 361 362 363 364 365 366 367 368 369
                                 [=](int n) { return n != 1; });
    if (blacklist && (perf_results[best_algo_idx].algo ==
                          CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
                      perf_results[best_algo_idx].algo ==
                          CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
      result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
    }
#endif
    result.workspace_size = GetWorkspaceSize(args, result.algo);
    if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
      // cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8
370 371
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
372 373 374 375 376 377
#else
      VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
                 "the workspace size request("
              << result.workspace_size << ") exceeds the limit("
              << workspace_size_limit << ")";
      PADDLE_ENFORCE_GPU_SUCCESS(
378
          phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
379 380 381 382 383
              args.handle,
              args.wdesc.desc(),
              args.odesc.desc(),
              args.cdesc.desc(),
              args.idesc.desc(),
384
              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
385 386
              workspace_size_limit,
              &(result.algo)));
387 388 389 390
#endif
    }
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
391
        phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
392 393 394 395 396
            args.handle,
            args.wdesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.idesc.desc(),
397
            CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
398 399
            workspace_size_limit,
            &(result.algo)));
400
#endif
H
hong 已提交
401
    result.workspace_size = GetWorkspaceSize(args, result.algo);
402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418
    return result;
  }

  template <typename T>
  static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
      const ConvArgs& args, const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
    size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit);
    VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
            << " MB";

    int returned_algo_count;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
    auto cudnn_find_func = [&](void* workspace_ptr) {
      PADDLE_ENFORCE_GPU_SUCCESS(
419
          phi::dynload::cudnnFindConvolutionBackwardDataAlgorithmEx(
420 421 422 423 424 425 426 427 428 429 430 431 432
              args.handle,
              args.wdesc.desc(),
              args.w->data<T>(),
              args.odesc.desc(),
              args.o->data<T>(),
              args.cdesc.desc(),
              args.idesc.desc(),
              const_cast<T*>(args.x->data<T>()),
              kNUM_CUDNN_BWD_DATA_ALGS,
              &returned_algo_count,
              perf_results.data(),
              workspace_ptr,
              max_workspace_size));
433 434 435
    };

    auto workspace_handle = ctx.cudnn_workspace_handle();
436 437
    workspace_handle.RunFuncSync(
        cudnn_find_func, max_workspace_size, UseFixedWorkspace());
438 439

    VLOG(4) << GetPerfResultString<PerfT>(
440 441 442 443 444 445
        "[Exhaustive Search] BwdDataAlgo Perf result",
        perf_results,
        returned_algo_count,
        workspace_size_limit);
    ChooseAlgoByWorkspace<PerfT, AlgoT>(
        perf_results, workspace_size_limit, &result);
446

H
hong 已提交
447
    result.workspace_size = GetWorkspaceSize(args, result.algo);
448 449 450 451 452
    return result;
  }

  static size_t GetMaxWorkspaceSize(const ConvArgs& args,
                                    size_t workspace_size_limit) {
453 454 455 456 457
    if (!UseFixedWorkspace()) {
      size_t max_workspace_size = 0;
      for (size_t algo = 0; algo < kNUM_CUDNN_BWD_DATA_ALGS; ++algo) {
        size_t workspace_size = 0;
        auto status =
458
            phi::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
459 460 461 462 463
                args.handle,
                args.wdesc.desc(),
                args.odesc.desc(),
                args.cdesc.desc(),
                args.idesc.desc(),
464 465
                static_cast<cudnnConvolutionBwdDataAlgo_t>(algo),
                &workspace_size);
466 467
        if (status == CUDNN_STATUS_SUCCESS &&
            workspace_size <= workspace_size_limit) {
468 469 470
          max_workspace_size = std::max(workspace_size, max_workspace_size);
        }
      }
471
      return max_workspace_size;
472 473 474 475
    } else {
      return workspace_size_limit;
    }
  }
Q
qingqing01 已提交
476 477
};

478 479 480 481
// cuDNN convution backward filter-algorithm searcher, consisted of three
// algorithm searching modes, namely: deterministic, heuristic, and
// exhaustive_search mode. As well as one workspace size acquirsition function
// with respect to the chosen alogrithm.
Q
qingqing01 已提交
482
template <>
483
struct SearchAlgorithmBase<ConvKind::kBackwardFilter> {
484 485
  using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t;
  using AlgoT = cudnnConvolutionBwdFilterAlgo_t;
486

Y
Yiqun Liu 已提交
487 488
  constexpr static phi::autotune::AlgorithmType kAlgoType =
      phi::autotune::AlgorithmType::kConvBackwardFilter;
Q
qingqing01 已提交
489

490 491
  static const std::string GetPerfName() { return "ConvBackwardFilter"; }

492 493
  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionBwdFilterAlgo_t algo) {
494
    paddle::platform::CUDAGraphCaptureModeGuard guard;
Q
qingqing01 已提交
495
    size_t workspace_size = 0;
496
    PADDLE_ENFORCE_GPU_SUCCESS(
497
        phi::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
498 499 500 501 502 503 504
            args.handle,
            args.idesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.wdesc.desc(),
            algo,
            &workspace_size));
Q
qingqing01 已提交
505 506
    return workspace_size;
  }
507

Y
Yiqun Liu 已提交
508
 protected:
H
hong 已提交
509 510 511 512 513
  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
    auto workspace_size =
        GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
    return SearchResult<AlgoT>(
        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, -1.0, workspace_size);
514 515 516 517 518 519 520 521 522 523 524 525 526
  }

  static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
                                               const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());

#if CUDNN_VERSION >= 7001
    int actual_perf_count;
    int best_algo_idx = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
    PADDLE_ENFORCE_GPU_SUCCESS(
527
        phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7(
528 529 530 531 532 533 534 535
            args.handle,
            args.idesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.wdesc.desc(),
            kNUM_CUDNN_BWD_FILTER_ALGS,
            &actual_perf_count,
            perf_results.data()));
536 537 538 539 540 541
    result.algo = perf_results[best_algo_idx].algo;
    result.workspace_size = perf_results[best_algo_idx].memory;

    if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
      // cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8
542 543
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
544 545 546 547 548 549
#else
      VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
                 "the workspace size request("
              << result.workspace_size << ") exceeds the limit("
              << workspace_size_limit << ")";
      PADDLE_ENFORCE_GPU_SUCCESS(
550
          phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
551 552 553 554 555
              args.handle,
              args.idesc.desc(),
              args.odesc.desc(),
              args.cdesc.desc(),
              args.wdesc.desc(),
556
              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
557 558
              workspace_size_limit,
              &(result.algo)));
559 560 561 562
#endif
    }
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
563
        phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
564 565 566 567 568
            args.handle,
            args.idesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.wdesc.desc(),
569
            CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
570 571
            workspace_size_limit,
            &(result.algo)));
572 573
#endif

H
hong 已提交
574
    result.workspace_size = GetWorkspaceSize(args, result.algo);
575 576 577 578 579 580 581 582 583 584 585 586
    return result;
  }

  template <typename T>
  static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
      const ConvArgs& args, const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    int returned_algo_count = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
    auto workspace_handle = ctx.cudnn_workspace_handle();
587
    if (paddle::platform::CudnnDataType<T>::type != CUDNN_DATA_HALF) {
588 589 590 591 592 593 594
      size_t max_workspace_size =
          GetMaxWorkspaceSize(args, workspace_size_limit);
      VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
              << " MB";

      auto cudnn_find_func = [&](void* workspace_ptr) {
        PADDLE_ENFORCE_GPU_SUCCESS(
595
            phi::dynload::cudnnFindConvolutionBackwardFilterAlgorithmEx(
596 597 598 599 600 601 602 603 604 605 606 607 608
                args.handle,
                args.idesc.desc(),
                args.x->data<T>(),
                args.odesc.desc(),
                args.o->data<T>(),
                args.cdesc.desc(),
                args.wdesc.desc(),
                const_cast<T*>(args.w->data<T>()),
                kNUM_CUDNN_BWD_FILTER_ALGS,
                &returned_algo_count,
                perf_results.data(),
                workspace_ptr,
                max_workspace_size));
609
      };
610 611
      workspace_handle.RunFuncSync(
          cudnn_find_func, max_workspace_size, UseFixedWorkspace());
612 613

      VLOG(4) << GetPerfResultString<PerfT>(
614 615 616 617 618 619
          "[Exhaustive Search] BwdFilterAlgo Perf result",
          perf_results,
          returned_algo_count,
          workspace_size_limit);
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
620 621 622 623
    } else {
      int max_algos = GetAlgorithmMaxCount(args.handle);
      std::vector<PerfT> perf_results(max_algos);
      PADDLE_ENFORCE_GPU_SUCCESS(
624
          phi::dynload::cudnnFindConvolutionBackwardFilterAlgorithm(
625 626 627 628 629 630 631 632
              args.handle,
              args.idesc.desc(),
              args.odesc.desc(),
              args.cdesc.desc(),
              args.wdesc.desc(),
              perf_results.size(),
              &returned_algo_count,
              perf_results.data()));
633 634 635
      perf_results.resize(returned_algo_count);

      VLOG(4) << GetPerfResultString<PerfT>(
636 637 638 639
          "[Exhaustive Search] BwdFilterAlgo Perf result",
          perf_results,
          perf_results.size(),
          workspace_size_limit);
640 641
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
642 643
    }

H
hong 已提交
644
    result.workspace_size = GetWorkspaceSize(args, result.algo);
645 646 647 648 649 650 651
    return result;
  }

  static int GetAlgorithmMaxCount(cudnnHandle_t handle) {
#if CUDNN_VERSION_MIN(7, 0, 1)
    int max_algos = 0;
    auto status =
652
        phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
653 654 655 656 657 658 659 660 661 662 663 664
            handle, &max_algos);
    if (status == gpuSuccess) {
      VLOG(5) << "[BackwardFilter] max_algos: predefined="
              << kNUM_CUDNN_BWD_FILTER_ALGS << ", actual=" << max_algos;
      return max_algos;
    }
#endif
    return kNUM_CUDNN_BWD_FILTER_ALGS;
  }

  static size_t GetMaxWorkspaceSize(const ConvArgs& args,
                                    size_t workspace_size_limit) {
665 666 667 668 669
    if (!UseFixedWorkspace()) {
      size_t max_workspace_size = 0;
      for (size_t algo = 0; algo < kNUM_CUDNN_BWD_FILTER_ALGS; ++algo) {
        size_t workspace_size = 0;
        auto status =
670
            phi::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
671 672 673 674 675
                args.handle,
                args.idesc.desc(),
                args.odesc.desc(),
                args.cdesc.desc(),
                args.wdesc.desc(),
676 677
                static_cast<cudnnConvolutionBwdFilterAlgo_t>(algo),
                &workspace_size);
678 679
        if (status == CUDNN_STATUS_SUCCESS &&
            workspace_size <= workspace_size_limit) {
680 681 682
          max_workspace_size = std::max(workspace_size, max_workspace_size);
        }
      }
683
      return max_workspace_size;
684 685 686 687
    } else {
      return workspace_size_limit;
    }
  }
Q
qingqing01 已提交
688 689
};

690 691 692
// Front-end searcher shared by all convolution kinds. Dispatches to the
// deterministic / heuristic / exhaustive strategies of SearchAlgorithmBase
// and caches results through the autotune cache.
template <ConvKind CK>
struct SearchAlgorithm : public SearchAlgorithmBase<CK> {
  using AlgoT = typename SearchAlgorithmBase<CK>::AlgoT;

  // Picks the convolution algorithm for |args| according to the requested
  // search mode, consulting and updating the autotune cache.
  template <typename T>
  static SearchResult<AlgoT> Find(const phi::GPUContext& ctx,
                                  const ConvArgs& args,
                                  bool exhaustive_search,
                                  bool deterministic,
                                  bool enable_autotune = true) {
    SearchResult<AlgoT> result;
    bool use_autotune = false;
    auto dtype = paddle::platform::CudnnDataType<T>::type;
    SetConvMathType(ctx, dtype, args.cdesc);

    if (deterministic) {
      result = SearchAlgorithmBase<CK>::FindAlgoDeterministic(args);
    } else {
      // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
      // 2. Once turning on auto-tune, run heuristic (default) before
      //    auto-tune process, run exhaustive_search during mentioned process.
      //    Auto tune is only enabled between specified range.
      // 3. After auto-tune process, run cached algorithm if cached, run
      //    default mode for the rest.
      auto key = args.ConvertToConvCacheKey<T>();
      auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv(
          SearchAlgorithmBase<CK>::kAlgoType);
      bool find_in_cache = cache.Find(key);
      if (find_in_cache) {
        auto cached = cache.Get(key);
        result.algo = static_cast<AlgoT>(cached.algo);
        result.workspace_size = cached.workspace_size;
        result.exhaustive_search = cached.exhaustive_search;
      }
      if (!result.exhaustive_search) {
        // In conv2d_transpose, enable_autotune is set to false because some
        // algorithm picked by exhaustive search method produce wrong result.
        use_autotune = enable_autotune &&
                       phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
        if (exhaustive_search || use_autotune) {
          // Once autotune is enabled, the autotuned result can rewrite the
          // previous result in cache found by heuristic method.
          result =
              SearchAlgorithmBase<CK>::template FindAlgoExhaustiveSearch<T>(
                  args, ctx);
          cache.Set(key,
                    phi::autotune::ConvAutoTuneResult(
                        static_cast<int64_t>(result.algo),
                        result.workspace_size,
                        true));
        } else if (!find_in_cache) {
          result = SearchAlgorithmBase<CK>::FindAlgoHeuristic(args, ctx);
          cache.Set(key,
                    phi::autotune::ConvAutoTuneResult(
                        static_cast<int64_t>(result.algo),
                        result.workspace_size,
                        false));
        }
      }
    }
    VLOG(3) << "[cuDNN " << SearchAlgorithmBase<CK>::GetPerfName()
            << "] exhaustive_search=" << exhaustive_search
            << ", use_autotune=" << use_autotune
            << ", deterministic=" << deterministic
            << ", choose algo=" << result.algo
            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
    return result;
  }

  // Configures the cuDNN math mode for |cdesc| based on compute capability
  // and data type (Tensor Core for FP16/BF16, FMA when TF32 is disallowed).
  static void SetConvMathType(
      const phi::GPUContext& ctx,
      cudnnDataType_t dtype,
      const paddle::platform::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
    if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_TENSOR_OP_MATH));
      VLOG(5) << "Enable Tensor Core for FLOAT16";
#if CUDA_VERSION >= 11000
#if CUDNN_VERSION_MIN(8, 1, 0)
    } else if (ctx.GetComputeCapability() >= 80 &&
               dtype == CUDNN_DATA_BFLOAT16) {
      VLOG(5) << "Enable Tensor Core for BFLOAT16";
      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_TENSOR_OP_MATH));
#endif  // CUDNN_VERSION_MIN(8, 1, 0)
    } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) {
      VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT";
      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_FMA_MATH));
#endif  // CUDA_VERSION >= 11000
    } else {
      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_DEFAULT_MATH));
    }
#endif
  }
};

// Primary template, intentionally empty: only the three ConvKind
// specializations below (forward / backward-data / backward-filter) are
// usable; any other instantiation has no Apply() and fails to compile.
template <typename T, ConvKind CK>
struct ConvRunner {};

// Runs cudnnConvolutionForward once per group, shifting the input, filter
// and output pointers by the per-group offsets for each invocation.
template <typename T>
struct ConvRunner<T, ConvKind::kForward> {
  static void Apply(
      const phi::GPUContext& ctx,
      const ConvArgs& args,
      const SearchResult<cudnnConvolutionFwdAlgo_t>& search_result,
      const T* input_ptr,
      const T* filter_ptr,
      T* output_ptr,
      int groups,
      int group_offset_in,
      int group_offset_filter,
      int group_offset_out,
      size_t workspace_size,
      phi::DnnWorkspaceHandle* workspace_handle,
      bool use_addto = false) {
    // beta == 1 accumulates into the existing output (addto), 0 overwrites.
    ScalingParamType<T> alpha = 1.0f;
    ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;

    auto handle = ctx.cudnn_handle();
    for (int g = 0; g < groups; ++g) {
      auto launch_conv = [&](void* workspace_ptr) {
        PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnConvolutionForward(
            handle,
            &alpha,
            args.idesc.desc(),
            input_ptr + g * group_offset_in,
            args.wdesc.desc(),
            filter_ptr + g * group_offset_filter,
            args.cdesc.desc(),
            search_result.algo,
            workspace_ptr,
            workspace_size,
            &beta,
            args.odesc.desc(),
            output_ptr + g * group_offset_out));
      };
      workspace_handle->RunFunc(launch_conv, workspace_size);
    }
  }
};

// Runs cudnnConvolutionBackwardData once per group, shifting the filter,
// output-grad and input-grad pointers by the per-group offsets each time.
template <typename T>
struct ConvRunner<T, ConvKind::kBackwardData> {
  static void Apply(
      const phi::GPUContext& ctx,
      const ConvArgs& args,
      const SearchResult<cudnnConvolutionBwdDataAlgo_t>& search_result,
      const T* output_grad_ptr,
      const T* filter_ptr,
      T* input_grad_ptr,
      int groups,
      int group_offset_in,
      int group_offset_filter,
      int group_offset_out,
      size_t workspace_size,
      phi::DnnWorkspaceHandle* workspace_handle,
      bool use_addto = false) {
    // beta == 1 accumulates into the existing input grad (addto), 0 overwrites.
    ScalingParamType<T> alpha = 1.0f;
    ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;

    auto handle = ctx.cudnn_handle();
    for (int g = 0; g < groups; ++g) {
      auto launch_conv = [&](void* workspace_ptr) {
        PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnConvolutionBackwardData(
            handle,
            &alpha,
            args.wdesc.desc(),
            filter_ptr + g * group_offset_filter,
            args.odesc.desc(),
            output_grad_ptr + g * group_offset_out,
            args.cdesc.desc(),
            search_result.algo,
            workspace_ptr,
            workspace_size,
            &beta,
            args.idesc.desc(),
            input_grad_ptr + g * group_offset_in));
      };
      workspace_handle->RunFunc(launch_conv, workspace_size);
    }
  }
};

// Runs cudnnConvolutionBackwardFilter once per group, shifting the input,
// output-grad and filter-grad pointers by the per-group offsets each time.
template <typename T>
struct ConvRunner<T, ConvKind::kBackwardFilter> {
  static void Apply(
      const phi::GPUContext& ctx,
      const ConvArgs& args,
      const SearchResult<cudnnConvolutionBwdFilterAlgo_t>& search_result,
      const T* output_grad_ptr,
      const T* input_ptr,
      T* filter_grad_ptr,
      int groups,
      int group_offset_in,
      int group_offset_filter,
      int group_offset_out,
      size_t workspace_size,
      phi::DnnWorkspaceHandle* workspace_handle,
      bool use_addto = false) {
    // beta == 1 accumulates into the existing filter grad (addto), 0 overwrites.
    ScalingParamType<T> alpha = 1.0f;
    ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;

    auto handle = ctx.cudnn_handle();
    for (int g = 0; g < groups; ++g) {
      auto launch_conv = [&](void* workspace_ptr) {
        PADDLE_ENFORCE_GPU_SUCCESS(
            phi::dynload::cudnnConvolutionBackwardFilter(
                handle,
                &alpha,
                args.idesc.desc(),
                input_ptr + g * group_offset_in,
                args.odesc.desc(),
                output_grad_ptr + g * group_offset_out,
                args.cdesc.desc(),
                search_result.algo,
                workspace_ptr,
                workspace_size,
                &beta,
                args.wdesc.desc(),
                filter_grad_ptr + g * group_offset_filter));
      };
      workspace_handle->RunFunc(launch_conv, workspace_size);
    }
  }
};

}  // namespace phi