/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"

namespace phi {

using ConvArgs = ConvArgsBase<cudnnHandle_t, cudnnDataType_t>;

template <typename PerfT>
std::string GetPerfResultString(std::string prefix,
                                const std::vector<PerfT>& perf_results,
                                int actual_algo_count,
                                size_t workspace_limit) {
  std::ostringstream out;
  out << prefix << " (workspace limit=" << ToMegaBytes(workspace_limit)
      << " MB):\n";
  for (int i = 0; i < actual_algo_count; ++i) {
    const auto& result = perf_results[i];
    auto math_type_str = (result.mathType == CUDNN_TENSOR_OP_MATH) ? "T" : "F";
    out << "  algo=" << result.algo << ": tensor_core=" << math_type_str
        << ", time=" << result.time
        << " ms, memory=" << ToMegaBytes(result.memory)
        << " MB, status=" << result.status << "\n";
  }
  return out.str();
}

// Choose the algorithm with the minimum time cost that also fits within the
// workspace limit, preferring the one that needs less memory when time costs
// are close.
// NOTE: perf_results is ordered by time.
template <typename PerfT, typename AlgoT>
void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
                           size_t workspace_limit,
                           SearchResult<AlgoT>* search_result) {
  int best_algo_idx = -1;
  for (size_t i = 0; i < perf_results.size(); ++i) {
    const auto& result = perf_results[i];
    if (result.status == CUDNN_STATUS_SUCCESS &&
        result.memory <= workspace_limit) {
      if (best_algo_idx == -1) {
        // The algorithm with the minimum time cost whose workspace_size fits
        // the workspace_limit constraint.
        best_algo_idx = i;
        // Each perf_results[i].time is set to -1 in heuristic search.
        if (perf_results[best_algo_idx].time < 0) {
          break;
        }
      } else {
        // If a later algorithm is within 1% of the best one's time, pick
        // whichever needs less memory.
        if (result.time < 1.01 * perf_results[best_algo_idx].time) {
          best_algo_idx = (result.memory < perf_results[best_algo_idx].memory)
                              ? i
                              : best_algo_idx;
          break;
        }
      }
    }
  }
  if (best_algo_idx != -1) {
    const auto& result = perf_results[best_algo_idx];
    search_result->algo = result.algo;
    search_result->time = result.time;
    search_result->workspace_size = result.memory;
    auto math_type_str = (result.mathType == CUDNN_TENSOR_OP_MATH) ? "T" : "F";
    VLOG(3) << "Choose algo=" << result.algo
            << ", tensor_core=" << math_type_str << ", time=" << result.time
            << " ms, memory=" << ToMegaBytes(result.memory)
            << " MB, status=" << result.status;
  } else {
    VLOG(3) << "Cannot find an algorithm that requires memory < "
            << ToMegaBytes(workspace_limit) << " MB";
  }
}
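
// Usage sketch (illustrative only; it mirrors the call sites in the searchers
// below, where perf_results comes from cudnnGetConvolutionForwardAlgorithm_v7
// or cudnnFindConvolutionForwardAlgorithmEx):
//
//   SearchResult<cudnnConvolutionFwdAlgo_t> result;
//   VLOG(4) << GetPerfResultString<cudnnConvolutionFwdAlgoPerf_t>(
//       "FwdAlgo Perf result", perf_results, returned_algo_count,
//       workspace_size_limit);
//   ChooseAlgoByWorkspace<cudnnConvolutionFwdAlgoPerf_t,
//                         cudnnConvolutionFwdAlgo_t>(
//       perf_results, workspace_size_limit, &result);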

template <ConvKind CK>
struct SearchAlgorithmBase {};

// cuDNN convolution forward-algorithm searcher, consisting of three search
// modes: deterministic, heuristic, and exhaustive_search. It also provides a
// workspace-size query for the chosen algorithm.
template <>
struct SearchAlgorithmBase<ConvKind::kForward> {
  using PerfT = cudnnConvolutionFwdAlgoPerf_t;
  using AlgoT = cudnnConvolutionFwdAlgo_t;

  constexpr static phi::autotune::AlgorithmType kAlgoType =
      phi::autotune::AlgorithmType::kConvForward;

  static const std::string GetPerfName() { return "ConvForward"; }

  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionFwdAlgo_t algo) {
    size_t workspace_size = 0;
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionForwardWorkspaceSize(args.handle,
                                                              args.idesc.desc(),
                                                              args.wdesc.desc(),
                                                              args.cdesc.desc(),
                                                              args.odesc.desc(),
                                                              algo,
                                                              &workspace_size));
    return workspace_size;
  }

 protected:
  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
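    // Algo index 1 is CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; it is
    // used here as the fixed deterministic choice, mirroring ALGO_1 in the
    // backward-data and backward-filter searchers below.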
    auto workspace_size = GetWorkspaceSize(args, static_cast<AlgoT>(1));
    return SearchResult<AlgoT>(static_cast<AlgoT>(1), -1.0, workspace_size);
  }

  // Heuristic search mode, calling cudnnGetXxxAlgorithm.
  static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
                                               const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());

#if CUDNN_VERSION >= 7001
    int actual_perf_count;
    int best_algo_idx = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
            args.handle,
            args.idesc.desc(),
            args.wdesc.desc(),
            args.cdesc.desc(),
            args.odesc.desc(),
            kNUM_CUDNN_FWD_ALGS,
            &actual_perf_count,
            perf_results.data()));
    result.algo = perf_results[best_algo_idx].algo;
    result.workspace_size = perf_results[best_algo_idx].memory;

    if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
      VLOG(4) << GetPerfResultString<PerfT>("[Heuristic] FwdAlgo Perf result",
                                            perf_results,
                                            actual_perf_count,
                                            workspace_size_limit);
      // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
#else
      VLOG(3) << "Fall back to the non-v7 method to find a conv algorithm "
                 "because the workspace size request ("
              << result.workspace_size << ") exceeds the limit ("
              << workspace_size_limit << ")";
      PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnGetConvolutionForwardAlgorithm(
              args.handle,
              args.idesc.desc(),
              args.wdesc.desc(),
              args.cdesc.desc(),
              args.odesc.desc(),
              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
              workspace_size_limit,
              &(result.algo)));
#endif
    }
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionForwardAlgorithm(
            args.handle,
            args.idesc.desc(),
            args.wdesc.desc(),
            args.cdesc.desc(),
            args.odesc.desc(),
            CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
            workspace_size_limit,
            &(result.algo)));
#endif
    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  template <typename T>
  static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
      const ConvArgs& args, const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
    size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit);
    VLOG(4) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
            << " MB";

    int returned_algo_count;
    std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
    auto cudnn_find_func = [&](void* workspace_ptr) {
      PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnFindConvolutionForwardAlgorithmEx(
              args.handle,
              args.idesc.desc(),
              args.x->data<T>(),
              args.wdesc.desc(),
              args.w->data<T>(),
              args.cdesc.desc(),
              args.odesc.desc(),
              const_cast<T*>(args.o->data<T>()),
              kNUM_CUDNN_FWD_ALGS,
              &returned_algo_count,
              perf_results.data(),
              workspace_ptr,
              max_workspace_size));
    };

    auto workspace_handle = ctx.cudnn_workspace_handle();
    workspace_handle.RunFuncSync(
        cudnn_find_func, max_workspace_size, UseFixedWorkspace());

    VLOG(4) << GetPerfResultString<PerfT>(
        "[Exhaustive Search] FwdAlgo Perf result",
        perf_results,
        returned_algo_count,
        workspace_size_limit);
    ChooseAlgoByWorkspace<PerfT, AlgoT>(
        perf_results, workspace_size_limit, &result);

    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  static size_t GetMaxWorkspaceSize(const ConvArgs& args,
                                    size_t workspace_size_limit) {
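    // Probe every forward algorithm and return the largest workspace size that
    // still fits under the limit, so one allocation can serve the whole
    // exhaustive search. With a fixed workspace, the limit itself is used.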
    if (!UseFixedWorkspace()) {
      size_t max_workspace_size = 0;
      for (size_t algo = 0; algo < kNUM_CUDNN_FWD_ALGS; ++algo) {
        size_t workspace_size = 0;
        auto status = phi::dynload::cudnnGetConvolutionForwardWorkspaceSize(
            args.handle,
            args.idesc.desc(),
            args.wdesc.desc(),
            args.cdesc.desc(),
            args.odesc.desc(),
            static_cast<cudnnConvolutionFwdAlgo_t>(algo),
            &workspace_size);
        if (status == CUDNN_STATUS_SUCCESS &&
            workspace_size <= workspace_size_limit) {
          max_workspace_size = std::max(workspace_size, max_workspace_size);
        }
      }
      return max_workspace_size;
    } else {
      return workspace_size_limit;
    }
  }
};

// cuDNN convolution backward-data algorithm searcher, consisting of three
// search modes: deterministic, heuristic, and exhaustive_search. It also
// provides a workspace-size query for the chosen algorithm.
template <>
struct SearchAlgorithmBase<ConvKind::kBackwardData> {
  using PerfT = cudnnConvolutionBwdDataAlgoPerf_t;
  using AlgoT = cudnnConvolutionBwdDataAlgo_t;

  constexpr static phi::autotune::AlgorithmType kAlgoType =
      phi::autotune::AlgorithmType::kConvBackwardData;

  static const std::string GetPerfName() { return "ConvBackwardData"; }

  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionBwdDataAlgo_t algo) {
    size_t workspace_size = 0;
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
            args.handle,
            args.wdesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.idesc.desc(),
            algo,
            &workspace_size));
    return workspace_size;
  }

 protected:
  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
    auto workspace_size =
        GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
    return SearchResult<AlgoT>(
        CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, -1.0, workspace_size);
  }

  static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
                                               const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());

#if CUDNN_VERSION >= 7001
    int actual_perf_count;
    int best_algo_idx = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7(
            args.handle,
            args.wdesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.idesc.desc(),
            kNUM_CUDNN_BWD_DATA_ALGS,
            &actual_perf_count,
            perf_results.data()));
    result.algo = perf_results[best_algo_idx].algo;

#if CUDNN_VERSION < 7500
    int stride_dim = args.x->dims().size() - 2;
    bool blacklist = std::any_of(args.s.begin(),
                                 args.s.begin() + stride_dim,
                                 [=](int n) { return n != 1; });
    if (blacklist && (perf_results[best_algo_idx].algo ==
                          CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
                      perf_results[best_algo_idx].algo ==
                          CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
      result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
    }
#endif
    result.workspace_size = GetWorkspaceSize(args, result.algo);
    if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
      // cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
#else
      VLOG(1) << "Fall back to the non-v7 method to find a conv algorithm "
                 "because the workspace size request ("
              << result.workspace_size << ") exceeds the limit ("
              << workspace_size_limit << ")";
      PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
              args.handle,
              args.wdesc.desc(),
              args.odesc.desc(),
              args.cdesc.desc(),
              args.idesc.desc(),
              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
              workspace_size_limit,
              &(result.algo)));
#endif
    }
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
            args.handle,
            args.wdesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.idesc.desc(),
            CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
            workspace_size_limit,
            &(result.algo)));
#endif
    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  template <typename T>
  static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
      const ConvArgs& args, const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
    size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit);
    VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
            << " MB";

    int returned_algo_count;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
    auto cudnn_find_func = [&](void* workspace_ptr) {
      PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnFindConvolutionBackwardDataAlgorithmEx(
              args.handle,
              args.wdesc.desc(),
              args.w->data<T>(),
              args.odesc.desc(),
              args.o->data<T>(),
              args.cdesc.desc(),
              args.idesc.desc(),
              const_cast<T*>(args.x->data<T>()),
              kNUM_CUDNN_BWD_DATA_ALGS,
              &returned_algo_count,
              perf_results.data(),
              workspace_ptr,
              max_workspace_size));
    };

    auto workspace_handle = ctx.cudnn_workspace_handle();
    workspace_handle.RunFuncSync(
        cudnn_find_func, max_workspace_size, UseFixedWorkspace());

    VLOG(4) << GetPerfResultString<PerfT>(
        "[Exhaustive Search] BwdDataAlgo Perf result",
        perf_results,
        returned_algo_count,
        workspace_size_limit);
    ChooseAlgoByWorkspace<PerfT, AlgoT>(
        perf_results, workspace_size_limit, &result);

    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  static size_t GetMaxWorkspaceSize(const ConvArgs& args,
                                    size_t workspace_size_limit) {
    if (!UseFixedWorkspace()) {
      size_t max_workspace_size = 0;
      for (size_t algo = 0; algo < kNUM_CUDNN_BWD_DATA_ALGS; ++algo) {
        size_t workspace_size = 0;
        auto status =
            phi::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
                args.handle,
                args.wdesc.desc(),
                args.odesc.desc(),
                args.cdesc.desc(),
                args.idesc.desc(),
                static_cast<cudnnConvolutionBwdDataAlgo_t>(algo),
                &workspace_size);
        if (status == CUDNN_STATUS_SUCCESS &&
            workspace_size <= workspace_size_limit) {
          max_workspace_size = std::max(workspace_size, max_workspace_size);
        }
      }
      return max_workspace_size;
    } else {
      return workspace_size_limit;
    }
  }
};

// cuDNN convolution backward-filter algorithm searcher, consisting of three
// search modes: deterministic, heuristic, and exhaustive_search. The
// exhaustive mode has two variants, one for HALF precision only and one for
// the rest. It also provides a workspace-size query for the chosen algorithm.
template <>
struct SearchAlgorithmBase<ConvKind::kBackwardFilter> {
  using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t;
  using AlgoT = cudnnConvolutionBwdFilterAlgo_t;

  constexpr static phi::autotune::AlgorithmType kAlgoType =
      phi::autotune::AlgorithmType::kConvBackwardFilter;

  static const std::string GetPerfName() { return "ConvBackwardFilter"; }

  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionBwdFilterAlgo_t algo) {
    phi::backends::gpu::CUDAGraphCaptureModeGuard guard;
    size_t workspace_size = 0;
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
            args.handle,
            args.idesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.wdesc.desc(),
            algo,
            &workspace_size));
    return workspace_size;
  }

 protected:
  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
    auto workspace_size =
        GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
    return SearchResult<AlgoT>(
        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, -1.0, workspace_size);
  }

  static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
                                               const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());

#if CUDNN_VERSION >= 7001
    int actual_perf_count;
    int best_algo_idx = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7(
            args.handle,
            args.idesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.wdesc.desc(),
            kNUM_CUDNN_BWD_FILTER_ALGS,
            &actual_perf_count,
            perf_results.data()));
    result.algo = perf_results[best_algo_idx].algo;
    result.workspace_size = perf_results[best_algo_idx].memory;

    if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
      // cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
#else
      VLOG(1) << "Fall back to the non-v7 method to find a conv algorithm "
                 "because the workspace size request ("
              << result.workspace_size << ") exceeds the limit ("
              << workspace_size_limit << ")";
      PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
              args.handle,
              args.idesc.desc(),
              args.odesc.desc(),
              args.cdesc.desc(),
              args.wdesc.desc(),
              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
              workspace_size_limit,
              &(result.algo)));
#endif
    }
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
        phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
            args.handle,
            args.idesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.wdesc.desc(),
            CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
            workspace_size_limit,
            &(result.algo)));
#endif

    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  template <typename T>
  static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
      const ConvArgs& args, const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    int returned_algo_count = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
    auto workspace_handle = ctx.cudnn_workspace_handle();
    if (phi::backends::gpu::CudnnDataType<T>::type != CUDNN_DATA_HALF) {
      size_t max_workspace_size =
          GetMaxWorkspaceSize(args, workspace_size_limit);
      VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
              << " MB";

      auto cudnn_find_func = [&](void* workspace_ptr) {
        PADDLE_ENFORCE_GPU_SUCCESS(
            phi::dynload::cudnnFindConvolutionBackwardFilterAlgorithmEx(
                args.handle,
                args.idesc.desc(),
                args.x->data<T>(),
                args.odesc.desc(),
                args.o->data<T>(),
                args.cdesc.desc(),
                args.wdesc.desc(),
                const_cast<T*>(args.w->data<T>()),
                kNUM_CUDNN_BWD_FILTER_ALGS,
                &returned_algo_count,
                perf_results.data(),
                workspace_ptr,
                max_workspace_size));
      };
      workspace_handle.RunFuncSync(
          cudnn_find_func, max_workspace_size, UseFixedWorkspace());

      VLOG(4) << GetPerfResultString<PerfT>(
          "[Exhaustive Search] BwdFilterAlgo Perf result",
          perf_results,
          returned_algo_count,
          workspace_size_limit);
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
    } else {
      int max_algos = GetAlgorithmMaxCount(args.handle);
      std::vector<PerfT> perf_results(max_algos);
      PADDLE_ENFORCE_GPU_SUCCESS(
          phi::dynload::cudnnFindConvolutionBackwardFilterAlgorithm(
              args.handle,
              args.idesc.desc(),
              args.odesc.desc(),
              args.cdesc.desc(),
              args.wdesc.desc(),
              perf_results.size(),
              &returned_algo_count,
              perf_results.data()));
      perf_results.resize(returned_algo_count);

      VLOG(4) << GetPerfResultString<PerfT>(
          "[Exhaustive Search] BwdFilterAlgo Perf result",
          perf_results,
          perf_results.size(),
          workspace_size_limit);
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
    }

    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  static int GetAlgorithmMaxCount(cudnnHandle_t handle) {
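    // Query how many backward-filter algorithms this cuDNN build reports;
    // fall back to kNUM_CUDNN_BWD_FILTER_ALGS if the query is unavailable or
    // fails.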
#if CUDNN_VERSION_MIN(7, 0, 1)
    int max_algos = 0;
    auto status =
        phi::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
            handle, &max_algos);
    if (status == gpuSuccess) {
      VLOG(5) << "[BackwardFilter] max_algos: predefined="
              << kNUM_CUDNN_BWD_FILTER_ALGS << ", actual=" << max_algos;
      return max_algos;
    }
#endif
    return kNUM_CUDNN_BWD_FILTER_ALGS;
  }

  static size_t GetMaxWorkspaceSize(const ConvArgs& args,
                                    size_t workspace_size_limit) {
    if (!UseFixedWorkspace()) {
      size_t max_workspace_size = 0;
      for (size_t algo = 0; algo < kNUM_CUDNN_BWD_FILTER_ALGS; ++algo) {
        size_t workspace_size = 0;
        auto status =
            phi::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
                args.handle,
                args.idesc.desc(),
                args.odesc.desc(),
                args.cdesc.desc(),
                args.wdesc.desc(),
                static_cast<cudnnConvolutionBwdFilterAlgo_t>(algo),
                &workspace_size);
        if (status == CUDNN_STATUS_SUCCESS &&
            workspace_size <= workspace_size_limit) {
          max_workspace_size = std::max(workspace_size, max_workspace_size);
        }
      }
      return max_workspace_size;
    } else {
      return workspace_size_limit;
    }
  }
};

template <ConvKind CK>
struct SearchAlgorithm : public SearchAlgorithmBase<CK> {
  using AlgoT = typename SearchAlgorithmBase<CK>::AlgoT;

  template <typename T>
  static SearchResult<AlgoT> Find(const phi::GPUContext& ctx,
                                  const ConvArgs& args,
                                  bool exhaustive_search,
                                  bool deterministic,
                                  bool enable_autotune = true) {
    SearchResult<AlgoT> result;
    bool use_autotune = false;
    auto dtype = phi::backends::gpu::CudnnDataType<T>::type;
    SetConvMathType(ctx, dtype, args.cdesc);

    if (deterministic) {
      result = SearchAlgorithmBase<CK>::FindAlgoDeterministic(args);
    } else {
      // 1. Once the exhaustive-search FLAG is turned on, always use exhaustive
      //    search.
      // 2. Once auto-tune is turned on, run the heuristic (default) search
      //    before the auto-tune process and exhaustive search during it.
      //    Auto-tune is only enabled within the specified step range.
      // 3. After the auto-tune process, use the cached algorithm if one is
      //    cached, and the default mode for the rest.
      auto key = args.ConvertToConvCacheKey<T>();
      auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv(
          SearchAlgorithmBase<CK>::kAlgoType);
      bool find_in_cache = cache.Find(key);
      if (find_in_cache) {
        auto t = cache.Get(key);
        result.algo = static_cast<AlgoT>(t.algo);
        result.workspace_size = t.workspace_size;
        result.exhaustive_search = t.exhaustive_search;
      }
      if (!result.exhaustive_search) {
        // In conv2d_transpose, enable_autotune is set to false because some
        // algorithms picked by the exhaustive search method produce wrong
        // results.
        use_autotune = enable_autotune &&
                       phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
        if (exhaustive_search || use_autotune) {
          // Once autotune is enabled, the autotuned result can overwrite the
          // previous result found by the heuristic method in the cache.
          result =
              SearchAlgorithmBase<CK>::template FindAlgoExhaustiveSearch<T>(
                  args, ctx);
          cache.Set(key,
                    phi::autotune::ConvAutoTuneResult(
                        static_cast<int64_t>(result.algo),
                        result.workspace_size,
                        true));
        } else if (!find_in_cache) {
          result = SearchAlgorithmBase<CK>::FindAlgoHeuristic(args, ctx);
          cache.Set(key,
                    phi::autotune::ConvAutoTuneResult(
                        static_cast<int64_t>(result.algo),
                        result.workspace_size,
                        false));
        }
      }
    }
    VLOG(3) << "[cuDNN " << SearchAlgorithmBase<CK>::GetPerfName()
            << "] exhaustive_search=" << exhaustive_search
            << ", use_autotune=" << use_autotune
            << ", deterministic=" << deterministic
            << ", choose algo=" << result.algo
            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
    return result;
  }

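  // Math-mode policy (as implemented below): enable Tensor Core math for FP16
  // on compute capability 7.0+ and for BF16 on 8.0+ with cuDNN 8.1+; for FP32
  // with TF32 disabled, force FMA math; otherwise keep cuDNN's default math.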
  static void SetConvMathType(
      const phi::GPUContext& ctx,
      cudnnDataType_t dtype,
      const phi::backends::gpu::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
    if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_TENSOR_OP_MATH));
      VLOG(5) << "Enable Tensor Core for FLOAT16";
#if CUDA_VERSION >= 11000
#if CUDNN_VERSION_MIN(8, 1, 0)
    } else if (ctx.GetComputeCapability() >= 80 &&
               dtype == CUDNN_DATA_BFLOAT16) {
      VLOG(5) << "Enable Tensor Core for BFLOAT16";
      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_TENSOR_OP_MATH));
#endif  // CUDNN_VERSION_MIN(8, 1, 0)
    } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) {
      VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT";
      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_FMA_MATH));
#endif  // CUDA_VERSION >= 11000
    } else {
      PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetConvolutionMathType(
          cdesc.desc(), CUDNN_DEFAULT_MATH));
    }
#endif
  }
};
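
// Caller-side sketch (illustrative; ConvArgs construction and the real call
// sites live in the conv GPU kernels, not in this header):
//
//   using Search = SearchAlgorithm<ConvKind::kForward>;
//   SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result =
//       Search::Find<T>(ctx, args, exhaustive_search, deterministic);
//   size_t workspace_size = fwd_result.workspace_size;
//
// The ConvRunner specializations below then issue the corresponding cuDNN
// call with the chosen algorithm and workspace size.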

template <typename T, ConvKind CK>
struct ConvRunner {};

template <typename T>
struct ConvRunner<T, ConvKind::kForward> {
  static void Apply(
      const phi::GPUContext& ctx,
      const ConvArgs& args,
      const SearchResult<cudnnConvolutionFwdAlgo_t>& search_result,
      const T* input_ptr,
      const T* filter_ptr,
      T* output_ptr,
      int groups,
      int group_offset_in,
      int group_offset_filter,
      int group_offset_out,
      size_t workspace_size,
      phi::DnnWorkspaceHandle* workspace_handle,
      bool use_addto = false) {
    ScalingParamType<T> alpha = 1.0f;
    ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;

    auto cudnn_handle = ctx.cudnn_handle();
    for (int i = 0; i < groups; i++) {
      workspace_handle->RunFunc(
          [&](void* workspace_ptr) {
            PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnConvolutionForward(
                cudnn_handle,
                &alpha,
                args.idesc.desc(),
                input_ptr + i * group_offset_in,
                args.wdesc.desc(),
                filter_ptr + i * group_offset_filter,
                args.cdesc.desc(),
                search_result.algo,
                workspace_ptr,
                workspace_size,
                &beta,
                args.odesc.desc(),
                output_ptr + i * group_offset_out));
          },
          workspace_size);
    }
  }
};
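
// Sketch of how Find() and Apply() are expected to be combined by a caller
// (variable names are illustrative; the real call sites live in the conv GPU
// kernels):
//
//   auto workspace_handle = ctx.cudnn_workspace_handle();
//   ConvRunner<T, ConvKind::kForward>::Apply(ctx, args, fwd_result,
//                                            input_data, filter_data,
//                                            output_data, groups,
//                                            group_offset_in,
//                                            group_offset_filter,
//                                            group_offset_out,
//                                            fwd_result.workspace_size,
//                                            &workspace_handle,
//                                            /*use_addto=*/false);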

template <typename T>
struct ConvRunner<T, ConvKind::kBackwardData> {
  static void Apply(
      const phi::GPUContext& ctx,
      const ConvArgs& args,
      const SearchResult<cudnnConvolutionBwdDataAlgo_t>& search_result,
      const T* output_grad_ptr,
      const T* filter_ptr,
      T* input_grad_ptr,
      int groups,
      int group_offset_in,
      int group_offset_filter,
      int group_offset_out,
      size_t workspace_size,
      phi::DnnWorkspaceHandle* workspace_handle,
      bool use_addto = false) {
    ScalingParamType<T> alpha = 1.0f;
    ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;

    auto cudnn_handle = ctx.cudnn_handle();
    for (int i = 0; i < groups; i++) {
      workspace_handle->RunFunc(
          [&](void* workspace_ptr) {
            PADDLE_ENFORCE_GPU_SUCCESS(
                phi::dynload::cudnnConvolutionBackwardData(
                    cudnn_handle,
                    &alpha,
                    args.wdesc.desc(),
                    filter_ptr + i * group_offset_filter,
                    args.odesc.desc(),
                    output_grad_ptr + i * group_offset_out,
                    args.cdesc.desc(),
                    search_result.algo,
                    workspace_ptr,
                    workspace_size,
                    &beta,
                    args.idesc.desc(),
                    input_grad_ptr + i * group_offset_in));
          },
          workspace_size);
    }
  }
};

template <typename T>
struct ConvRunner<T, ConvKind::kBackwardFilter> {
  static void Apply(
      const phi::GPUContext& ctx,
      const ConvArgs& args,
      const SearchResult<cudnnConvolutionBwdFilterAlgo_t>& search_result,
      const T* output_grad_ptr,
      const T* input_ptr,
      T* filter_grad_ptr,
      int groups,
      int group_offset_in,
      int group_offset_filter,
      int group_offset_out,
      size_t workspace_size,
      phi::DnnWorkspaceHandle* workspace_handle,
      bool use_addto = false) {
    ScalingParamType<T> alpha = 1.0f;
    ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;

    auto cudnn_handle = ctx.cudnn_handle();
    for (int i = 0; i < groups; i++) {
      workspace_handle->RunFunc(
          [&](void* workspace_ptr) {
            PADDLE_ENFORCE_GPU_SUCCESS(
                phi::dynload::cudnnConvolutionBackwardFilter(
                    cudnn_handle,
                    &alpha,
                    args.idesc.desc(),
                    input_ptr + i * group_offset_in,
                    args.odesc.desc(),
                    output_grad_ptr + i * group_offset_out,
                    args.cdesc.desc(),
                    search_result.algo,
                    workspace_ptr,
                    workspace_size,
                    &beta,
                    args.wdesc.desc(),
                    filter_grad_ptr + i * group_offset_filter));
          },
          workspace_size);
    }
  }
};

}  // namespace phi