conv_cudnn_helper.h 33.6 KB
Newer Older
Q
qingqing01 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

17
#include "paddle/fluid/framework/eigen.h"
18
#include "paddle/fluid/operators/conv_base_helper.h"
19
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
20
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
21 22
#include "paddle/fluid/platform/profiler.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
23
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
24

Q
qingqing01 已提交
25 26 27
namespace paddle {
namespace operators {

28
using ConvArgs = ConvArgsBase<cudnnHandle_t, cudnnDataType_t>;
29 30

template <typename DeviceContext, typename T, size_t D>
H
hong 已提交
31
static void RemovePaddingSlice(const phi::GPUContext& context,
32 33
                               const Tensor* input,
                               Tensor* out,
34 35
                               const std::vector<int>& starts,
                               const std::vector<int>& axes) {
H
hong 已提交
36
  auto& place = *context.eigen_device();
37 38
  auto in_dims = input->dims();
  auto new_out_dims = out->dims();
39 40
  auto offsets = Eigen::DSizes<Eigen::DenseIndex, D>();
  auto extents = Eigen::DSizes<Eigen::DenseIndex, D>();
41 42 43 44 45 46
  for (size_t i = 0; i < D; ++i) {
    offsets[i] = 0;
    extents[i] = new_out_dims[i];
  }

  for (size_t i = 0; i < axes.size(); ++i) {
47
    int start = starts[i];
48 49 50 51 52 53
    if (start < 0) {
      start = (start + in_dims[axes[i]]);
    }
    start = std::max(start, 0);
    offsets[axes[i]] = start;
  }
54

55 56 57 58 59 60
  auto in_t =
      framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
          *input);
  auto out_t =
      framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
          *out, new_out_dims);
61 62 63

  phi::funcs::EigenSlice<std::decay_t<decltype(place)>, T, D>::Eval(
      place, out_t, in_t, offsets, extents);
64 65
}

66 67
// Converts a byte count to mebibytes (1 MB == 2^20 bytes here); used for
// human-readable log output.
static inline double ToMegaBytes(size_t bytes) {
  return static_cast<double>(bytes) / (1 << 20);
}

70 71
// Returns true when FLAGS_conv_workspace_size_limit is non-negative, i.e. the
// flag value is to be used as the workspace limit (see
// CalcWorkspaceLimitInBytes). A negative flag value selects the limit derived
// from the device memory statistics instead.
static inline bool UseFixedWorkspace() {
  return FLAGS_conv_workspace_size_limit >= 0;
}

74 75
static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
  if (!use_fixed_workspace) {
76
    int device_id = platform::GetCurrentDeviceId();
77 78 79 80
    int64_t allocated =
        memory::DeviceMemoryStatCurrentValue("Allocated", device_id);
    int64_t reserved =
        memory::DeviceMemoryStatCurrentValue("Reserved", device_id);
81 82 83
    int64_t availble = platform::GpuAvailableMemToAlloc();
    VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
            << " MB, reserved=" << ToMegaBytes(reserved)
84 85
            << " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB.";
    return std::max(availble, reserved - allocated);
86 87
  } else {
    return FLAGS_conv_workspace_size_limit * 1024 * 1024;
88 89 90
  }
}

91 92 93
// Formats the first `actual_algo_count` entries of `perf_results` as a
// multi-line, human-readable report for logging.
//
// The first line is `prefix` followed by the workspace limit in MB; each
// following line lists one entry's algo id, whether it uses tensor-op math
// ("T"/"F"), time, memory in MB and cuDNN status.
//
// `actual_algo_count` is clamped to [0, perf_results.size()] so a count that
// exceeds the vector cannot cause an out-of-bounds read.
template <typename PerfT>
std::string GetPerfResultString(std::string prefix,
                                const std::vector<PerfT>& perf_results,
                                int actual_algo_count,
                                size_t workspace_limit) {
  size_t count =
      actual_algo_count > 0 ? static_cast<size_t>(actual_algo_count) : 0;
  if (count > perf_results.size()) {
    count = perf_results.size();
  }

  std::ostringstream out;
  out << prefix << " (workspace limit=" << ToMegaBytes(workspace_limit)
      << " MB):\n";
  for (size_t i = 0; i < count; ++i) {
    const auto& result = perf_results[i];
    auto math_type_str = (result.mathType == CUDNN_TENSOR_OP_MATH) ? "T" : "F";
    out << "  algo=" << result.algo << ": tensor_core=" << math_type_str
        << ", time=" << result.time
        << " ms, memory=" << ToMegaBytes(result.memory)
        << " MB, status=" << result.status << "\n";
  }
  return out.str();
}
109

110 111
// Choose an algorithm which has the minimize time cost and less memory.
// NOTE: perf_results is ordered by time.
112 113 114
template <typename PerfT, typename AlgoT>
void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
                           size_t workspace_limit,
115 116
                           SearchResult<AlgoT>* search_result) {
  int best_algo_idx = -1;
117 118
  for (size_t i = 0; i < perf_results.size(); ++i) {
    auto result = perf_results[i];
119
    if (result.status == CUDNN_STATUS_SUCCESS &&
120
        result.memory < workspace_limit) {
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
      if (best_algo_idx == -1) {
        // The algorithm which has minimize time cost and need a workspace_size
        // fitting the workspace_limit constraint.
        best_algo_idx = i;
        // Each perf_results[i].time is set to be -1 in heuristic search.
        if (perf_results[best_algo_idx].time < 0) {
          break;
        }
      } else {
        float best_algo_time = perf_results[best_algo_idx].time;
        if ((result.time - best_algo_time) / best_algo_time < 0.01) {
          best_algo_idx = (result.memory < perf_results[best_algo_idx].memory)
                              ? i
                              : best_algo_idx;
          break;
        }
      }
138 139
    }
  }
140 141 142 143 144 145 146 147
  if (best_algo_idx != -1) {
    search_result->algo = perf_results[best_algo_idx].algo;
    search_result->time = perf_results[best_algo_idx].time;
    search_result->workspace_size = perf_results[best_algo_idx].memory;
  } else {
    VLOG(3) << "Can not find an algorithm that requires memory < "
            << ToMegaBytes(workspace_limit) << " MB";
  }
148 149
}

150 151
// Sets the cuDNN math type on the convolution descriptor `cdesc` according to
// the device compute capability and the data type:
//   - HALF on compute capability >= 70            -> CUDNN_TENSOR_OP_MATH
//   - BFLOAT16 on compute capability >= 80
//     (CUDA >= 11.0 and cuDNN >= 8.1 only)         -> CUDNN_TENSOR_OP_MATH
//   - FLOAT with cdesc.allow_tf32_ == false
//     (CUDA >= 11.0 only)                          -> CUDNN_FMA_MATH
//   - anything else                                -> CUDNN_DEFAULT_MATH
// The whole body is compiled out unless CUDA >= 9.0 and cuDNN >= 7.0.1.
//
// NOTE: the if/else chain is stitched together across preprocessor
// conditionals, so the braces only balance once the #if blocks are resolved.
static void SetConvMathType(const phi::GPUContext& ctx,
                            cudnnDataType_t dtype,
                            const platform::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
  if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
        cdesc.desc(), CUDNN_TENSOR_OP_MATH));
    VLOG(5) << "use cudnn_tensor_op_math";
#if CUDA_VERSION >= 11000
#if CUDNN_VERSION_MIN(8, 1, 0)
  } else if (ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) {
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
        cdesc.desc(), CUDNN_TENSOR_OP_MATH));
#endif  // CUDNN_VERSION_MIN(8, 1, 0)
  } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) {
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
        cdesc.desc(), CUDNN_FMA_MATH));
#endif  // CUDA_VERSION >= 11000
  } else {
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
        cdesc.desc(), CUDNN_DEFAULT_MATH));
    VLOG(5) << "NOT use cudnn_tensor_op_math";
  }
#endif
}

176 177 178 179
// cuDNN convolution forward algorithm searcher, consisted of three searching
// modes, namely: deterministic, heuristic and exhaustive_search mode.
// As well as one workspace size acquirsition function with respect to
// the chosen alogrithm.
Q
qingqing01 已提交
180 181
template <>
struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
182 183
  using PerfT = cudnnConvolutionFwdAlgoPerf_t;
  using AlgoT = cudnnConvolutionFwdAlgo_t;
Q
qingqing01 已提交
184 185

  template <typename T>
186 187
  static SearchResult<AlgoT> Find(const ConvArgs& args,
                                  bool exhaustive_search,
188 189 190
                                  bool deterministic,
                                  const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
Q
qingqing01 已提交
191
    auto dtype = platform::CudnnDataType<T>::type;
192
    SetConvMathType(ctx, dtype, args.cdesc);
193

194
    if (deterministic) {
H
hong 已提交
195
      result = FindAlgoDeterministic(args);
Q
qingqing01 已提交
196
    } else {
197 198 199 200 201
      // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
      // 2. Once turning on auto-tune, runn heuristic search(default) before
      //    auto-tune process, run exhaustive_search during mentioned process.
      // 3. After auto-tune process, run cached algorithm if cached, run
      //    default mode for the rest.
H
hong 已提交
202
      auto key = args.Convert2ConvCacheKey<T>();
203 204
      auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward();
      if (cache.Find(key)) {
H
hong 已提交
205 206 207
        auto t = cache.Get(key);
        result.algo = static_cast<AlgoT>(t.algo);
        result.workspace_size = t.workspace_size;
208 209 210 211 212 213 214 215
      } else {
        bool use_autotune =
            phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
        if (exhaustive_search || use_autotune) {
          result = FindAlgoExhaustiveSearch<T>(args, ctx);
        } else {
          result = FindAlgoHeuristic(args, ctx);
        }
H
hong 已提交
216 217 218
        phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
                                    result.workspace_size);
        cache.Set(key, node);
219
      }
Q
qingqing01 已提交
220
    }
221 222
    VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
            << ", deterministic=" << deterministic
H
hong 已提交
223 224
            << ", choose algo=" << result.algo
            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
225
    return result;
Q
qingqing01 已提交
226 227
  }

228 229
  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionFwdAlgo_t algo) {
Q
qingqing01 已提交
230
    size_t workspace_size = 0;
231
    PADDLE_ENFORCE_GPU_SUCCESS(
232
        platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
233 234 235 236 237 238 239
            args.handle,
            args.idesc.desc(),
            args.wdesc.desc(),
            args.cdesc.desc(),
            args.odesc.desc(),
            algo,
            &workspace_size));
Q
qingqing01 已提交
240 241
    return workspace_size;
  }
242 243

 private:
H
hong 已提交
244 245 246
  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
    auto workspace_size = GetWorkspaceSize(args, static_cast<AlgoT>(1));
    return SearchResult<AlgoT>(static_cast<AlgoT>(1), -1.0, workspace_size);
247 248 249 250 251 252 253 254 255 256 257 258 259 260 261
  }

  // Heuristic search mode, calling the cudnnGetXxxAlgorithm.
  static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
                                               const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());

#if CUDNN_VERSION >= 7001
    int actual_perf_count;
    int best_algo_idx = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
262 263 264 265 266 267 268 269
            args.handle,
            args.idesc.desc(),
            args.wdesc.desc(),
            args.cdesc.desc(),
            args.odesc.desc(),
            kNUM_CUDNN_FWD_ALGS,
            &actual_perf_count,
            perf_results.data()));
270 271 272 273 274 275
    result.algo = perf_results[best_algo_idx].algo;
    result.workspace_size = perf_results[best_algo_idx].memory;

    if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
      // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8
276 277
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
278 279 280 281 282 283 284
#else
      VLOG(3) << "Fallback to non-v7 method to find conv algorithm "
                 "becasue the workspace size request("
              << result.workspace_size << ") exceeds the limit("
              << workspace_size_limit << ")";
      PADDLE_ENFORCE_GPU_SUCCESS(
          platform::dynload::cudnnGetConvolutionForwardAlgorithm(
285 286 287 288 289
              args.handle,
              args.idesc.desc(),
              args.wdesc.desc(),
              args.cdesc.desc(),
              args.odesc.desc(),
290
              CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
291 292
              workspace_size_limit,
              &(result.algo)));
293 294 295 296 297
#endif
    }
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnGetConvolutionForwardAlgorithm(
298 299 300 301 302 303 304
            args.handle,
            args.idesc.desc(),
            args.wdesc.desc(),
            args.cdesc.desc(),
            args.odesc.desc(),
            CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
            workspace_size_limit,
305 306
            &(result.algo)));
#endif
H
hong 已提交
307
    result.workspace_size = GetWorkspaceSize(args, result.algo);
308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325
    return result;
  }

  template <typename T>
  static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
      const ConvArgs& args, const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
    size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit);
    VLOG(4) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
            << " MB";

    int returned_algo_count;
    std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
    auto cudnn_find_func = [&](void* workspace_ptr) {
      PADDLE_ENFORCE_GPU_SUCCESS(
          platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
326 327 328 329 330 331 332 333 334 335 336 337 338
              args.handle,
              args.idesc.desc(),
              args.x->data<T>(),
              args.wdesc.desc(),
              args.w->data<T>(),
              args.cdesc.desc(),
              args.odesc.desc(),
              const_cast<T*>(args.o->data<T>()),
              kNUM_CUDNN_FWD_ALGS,
              &returned_algo_count,
              perf_results.data(),
              workspace_ptr,
              max_workspace_size));
339 340 341
    };

    auto workspace_handle = ctx.cudnn_workspace_handle();
342 343
    workspace_handle.RunFuncSync(
        cudnn_find_func, max_workspace_size, UseFixedWorkspace());
344 345

    VLOG(4) << GetPerfResultString<PerfT>(
346 347 348 349 350 351
        "[Exhaustive Search] FwdAlgo Perf result",
        perf_results,
        returned_algo_count,
        workspace_size_limit);
    ChooseAlgoByWorkspace<PerfT, AlgoT>(
        perf_results, workspace_size_limit, &result);
352

H
hong 已提交
353
    result.workspace_size = GetWorkspaceSize(args, result.algo);
354 355 356 357 358
    return result;
  }

  static size_t GetMaxWorkspaceSize(const ConvArgs& args,
                                    size_t workspace_size_limit) {
359 360 361 362 363 364
    if (!UseFixedWorkspace()) {
      size_t max_workspace_size = 0;
      for (size_t algo = 0; algo < kNUM_CUDNN_FWD_ALGS; ++algo) {
        size_t workspace_size = 0;
        auto status =
            platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
365 366 367 368 369 370 371
                args.handle,
                args.idesc.desc(),
                args.wdesc.desc(),
                args.cdesc.desc(),
                args.odesc.desc(),
                static_cast<cudnnConvolutionFwdAlgo_t>(algo),
                &workspace_size);
372 373
        if (status == CUDNN_STATUS_SUCCESS &&
            workspace_size <= workspace_size_limit) {
374 375 376
          max_workspace_size = std::max(workspace_size, max_workspace_size);
        }
      }
377
      return max_workspace_size;
378 379 380 381
    } else {
      return workspace_size_limit;
    }
  }
Q
qingqing01 已提交
382 383
};

384 385 386 387 388 389
// cuDNN convolution backward data-algorithm searcher, consisting of three
// searching modes, namely: deterministic, heuristic, and exhaustive_search
// mode. Specially, there are 2 pattens of exhaustive search mode, one for
// HALF precision only, one for the rest.
// As well as one workspace size acquirsition function with
// respect to the chosen alogrithm.
Q
qingqing01 已提交
390 391
template <>
struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
392 393
  using PerfT = cudnnConvolutionBwdDataAlgoPerf_t;
  using AlgoT = cudnnConvolutionBwdDataAlgo_t;
Q
qingqing01 已提交
394 395

  template <typename T>
396 397
  static SearchResult<AlgoT> Find(const ConvArgs& args,
                                  bool exhaustive_search,
398 399 400
                                  bool deterministic,
                                  const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
Q
qingqing01 已提交
401
    auto dtype = platform::CudnnDataType<T>::type;
402
    SetConvMathType(ctx, dtype, args.cdesc);
403

404
    if (deterministic) {
H
hong 已提交
405
      result = FindAlgoDeterministic(args);
Q
qingqing01 已提交
406
    } else {
407 408 409 410 411
      // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
      // 2. Once turning on auto-tune, runn heuristic search(default) before
      //    auto-tune process, run exhaustive_search during mentioned process.
      // 3. After auto-tune process, run cached algorithm if cached, run
      //    default mode for the rest.
H
hong 已提交
412
      auto key = args.Convert2ConvCacheKey<T>();
413 414 415
      auto& cache =
          phi::autotune::AutoTuneCache::Instance().GetConvBackwardData();
      if (cache.Find(key)) {
H
hong 已提交
416 417 418
        auto t = cache.Get(key);
        result.algo = static_cast<AlgoT>(t.algo);
        result.workspace_size = t.workspace_size;
419 420 421 422 423 424 425 426
      } else {
        bool use_autotune =
            phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
        if (exhaustive_search || use_autotune) {
          result = FindAlgoExhaustiveSearch<T>(args, ctx);
        } else {
          result = FindAlgoHeuristic(args, ctx);
        }
H
hong 已提交
427 428 429
        phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
                                    result.workspace_size);
        cache.Set(key, node);
430
      }
Q
qingqing01 已提交
431
    }
432 433
    VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
            << ", deterministic=" << deterministic
H
hong 已提交
434 435
            << ", choose algo=" << result.algo
            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
436
    return result;
Q
qingqing01 已提交
437 438
  }

439 440
  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionBwdDataAlgo_t algo) {
Q
qingqing01 已提交
441
    size_t workspace_size = 0;
442
    PADDLE_ENFORCE_GPU_SUCCESS(
Q
qingqing01 已提交
443
        platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
444 445 446 447 448 449 450
            args.handle,
            args.wdesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.idesc.desc(),
            algo,
            &workspace_size));
Q
qingqing01 已提交
451 452
    return workspace_size;
  }
453 454

 private:
H
hong 已提交
455 456 457 458 459
  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
    auto workspace_size =
        GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
    return SearchResult<AlgoT>(
        CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, -1.0, workspace_size);
460 461 462 463 464 465 466 467 468 469 470 471 472 473
  }

  static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
                                               const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());

#if CUDNN_VERSION >= 7001
    int actual_perf_count;
    int best_algo_idx = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7(
474 475 476 477 478 479 480 481
            args.handle,
            args.wdesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.idesc.desc(),
            kNUM_CUDNN_BWD_DATA_ALGS,
            &actual_perf_count,
            perf_results.data()));
482 483 484 485
    result.algo = perf_results[best_algo_idx].algo;

#if CUDNN_VERSION < 7500
    int stride_dim = args.x->dims().size() - 2;
486 487
    bool blacklist = std::any_of(args.s.begin(),
                                 args.s.begin() + stride_dim,
488 489 490 491 492 493 494 495 496 497 498 499
                                 [=](int n) { return n != 1; });
    if (blacklist && (perf_results[best_algo_idx].algo ==
                          CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
                      perf_results[best_algo_idx].algo ==
                          CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
      result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
    }
#endif
    result.workspace_size = GetWorkspaceSize(args, result.algo);
    if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
      // cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8
500 501
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
502 503 504 505 506 507 508
#else
      VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
                 "the workspace size request("
              << result.workspace_size << ") exceeds the limit("
              << workspace_size_limit << ")";
      PADDLE_ENFORCE_GPU_SUCCESS(
          platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
509 510 511 512 513
              args.handle,
              args.wdesc.desc(),
              args.odesc.desc(),
              args.cdesc.desc(),
              args.idesc.desc(),
514
              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
515 516
              workspace_size_limit,
              &(result.algo)));
517 518 519 520 521
#endif
    }
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
522 523 524 525 526
            args.handle,
            args.wdesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.idesc.desc(),
527
            CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
528 529
            workspace_size_limit,
            &(result.algo)));
530
#endif
H
hong 已提交
531
    result.workspace_size = GetWorkspaceSize(args, result.algo);
532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549
    return result;
  }

  template <typename T>
  static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
      const ConvArgs& args, const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
    size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit);
    VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
            << " MB";

    int returned_algo_count;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
    auto cudnn_find_func = [&](void* workspace_ptr) {
      PADDLE_ENFORCE_GPU_SUCCESS(
          platform::dynload::cudnnFindConvolutionBackwardDataAlgorithmEx(
550 551 552 553 554 555 556 557 558 559 560 561 562
              args.handle,
              args.wdesc.desc(),
              args.w->data<T>(),
              args.odesc.desc(),
              args.o->data<T>(),
              args.cdesc.desc(),
              args.idesc.desc(),
              const_cast<T*>(args.x->data<T>()),
              kNUM_CUDNN_BWD_DATA_ALGS,
              &returned_algo_count,
              perf_results.data(),
              workspace_ptr,
              max_workspace_size));
563 564 565
    };

    auto workspace_handle = ctx.cudnn_workspace_handle();
566 567
    workspace_handle.RunFuncSync(
        cudnn_find_func, max_workspace_size, UseFixedWorkspace());
568 569

    VLOG(4) << GetPerfResultString<PerfT>(
570 571 572 573 574 575
        "[Exhaustive Search] BwdDataAlgo Perf result",
        perf_results,
        returned_algo_count,
        workspace_size_limit);
    ChooseAlgoByWorkspace<PerfT, AlgoT>(
        perf_results, workspace_size_limit, &result);
576

H
hong 已提交
577
    result.workspace_size = GetWorkspaceSize(args, result.algo);
578 579 580 581 582
    return result;
  }

  static size_t GetMaxWorkspaceSize(const ConvArgs& args,
                                    size_t workspace_size_limit) {
583 584 585 586 587 588
    if (!UseFixedWorkspace()) {
      size_t max_workspace_size = 0;
      for (size_t algo = 0; algo < kNUM_CUDNN_BWD_DATA_ALGS; ++algo) {
        size_t workspace_size = 0;
        auto status =
            platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
589 590 591 592 593
                args.handle,
                args.wdesc.desc(),
                args.odesc.desc(),
                args.cdesc.desc(),
                args.idesc.desc(),
594 595
                static_cast<cudnnConvolutionBwdDataAlgo_t>(algo),
                &workspace_size);
596 597
        if (status == CUDNN_STATUS_SUCCESS &&
            workspace_size <= workspace_size_limit) {
598 599 600
          max_workspace_size = std::max(workspace_size, max_workspace_size);
        }
      }
601
      return max_workspace_size;
602 603 604 605
    } else {
      return workspace_size_limit;
    }
  }
Q
qingqing01 已提交
606 607
};

608 609 610 611
// cuDNN convolution backward filter-algorithm searcher, consisting of three
// algorithm searching modes, namely: deterministic, heuristic, and
// exhaustive_search mode, as well as one workspace size acquisition function
// with respect to the chosen algorithm.
Q
qingqing01 已提交
612 613
template <>
struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
614 615
  using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t;
  using AlgoT = cudnnConvolutionBwdFilterAlgo_t;
Q
qingqing01 已提交
616 617

  template <typename T>
618 619
  static SearchResult<AlgoT> Find(const ConvArgs& args,
                                  bool exhaustive_search,
620 621
                                  bool deterministic,
                                  const phi::GPUContext& ctx) {
622
    platform::CUDAGraphCaptureModeGuard guard;
623
    SearchResult<AlgoT> result;
Q
qingqing01 已提交
624
    auto dtype = platform::CudnnDataType<T>::type;
625
    SetConvMathType(ctx, dtype, args.cdesc);
Q
qingqing01 已提交
626

627
    if (deterministic) {
H
hong 已提交
628
      result = FindAlgoDeterministic(args);
Q
qingqing01 已提交
629
    } else {
630 631 632 633 634
      // 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
      // 2. Once turning on auto-tune, runn heuristic search(default) before
      //    auto-tune process, run exhaustive_search during mentioned process.
      // 3. After auto-tune process, run cached algorithm if cached, run
      //    default mode for the rest.
H
hong 已提交
635
      auto key = args.Convert2ConvCacheKey<T>();
636 637 638
      auto& cache =
          phi::autotune::AutoTuneCache::Instance().GetConvBackwardFilter();
      if (cache.Find(key)) {
H
hong 已提交
639 640 641
        auto t = cache.Get(key);
        result.algo = static_cast<AlgoT>(t.algo);
        result.workspace_size = t.workspace_size;
642
      } else {
643 644 645 646 647 648 649
        bool use_autotune =
            phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
        if (exhaustive_search || use_autotune) {
          result = FindAlgoExhaustiveSearch<T>(args, ctx);
        } else {
          result = FindAlgoHeuristic(args, ctx);
        }
H
hong 已提交
650 651 652
        phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
                                    result.workspace_size);
        cache.Set(key, node);
653
      }
Q
qingqing01 已提交
654
    }
655 656
    VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
            << ", deterministic=" << deterministic
H
hong 已提交
657 658
            << ", choose algo=" << result.algo
            << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
659
    return result;
Q
qingqing01 已提交
660 661
  }

662 663
  static size_t GetWorkspaceSize(const ConvArgs& args,
                                 cudnnConvolutionBwdFilterAlgo_t algo) {
664
    platform::CUDAGraphCaptureModeGuard guard;
Q
qingqing01 已提交
665
    size_t workspace_size = 0;
666
    PADDLE_ENFORCE_GPU_SUCCESS(
Q
qingqing01 已提交
667
        platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
668 669 670 671 672 673 674
            args.handle,
            args.idesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.wdesc.desc(),
            algo,
            &workspace_size));
Q
qingqing01 已提交
675 676
    return workspace_size;
  }
677 678

 private:
H
hong 已提交
679 680 681 682 683
  // Deterministic mode: always returns CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
  // together with the workspace size it requires; time is set to -1.
  static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
    auto workspace_size =
        GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
    return SearchResult<AlgoT>(
        CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, -1.0, workspace_size);
  }

  // Heuristic search mode.
  //
  // With cuDNN >= 7.0.1 the first candidate returned by the _v7 query is
  // used unless its workspace exceeds the limit; in that case a fitting
  // candidate is chosen via ChooseAlgoByWorkspace (cuDNN >= 8) or the legacy
  // workspace-limited query (older cuDNN). The returned workspace size is
  // always re-queried for the final algorithm. `ctx` is not used here.
  static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
                                               const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());

#if CUDNN_VERSION >= 7001
    int actual_perf_count = 0;
    int best_algo_idx = 0;
    std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7(
            args.handle,
            args.idesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.wdesc.desc(),
            kNUM_CUDNN_BWD_FILTER_ALGS,
            &actual_perf_count,
            perf_results.data()));
    result.algo = perf_results[best_algo_idx].algo;
    result.workspace_size = perf_results[best_algo_idx].memory;

    if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
      // cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
#else
      VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
                 "the workspace size request("
              << result.workspace_size << ") exceeds the limit("
              << workspace_size_limit << ")";
      PADDLE_ENFORCE_GPU_SUCCESS(
          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
              args.handle,
              args.idesc.desc(),
              args.odesc.desc(),
              args.cdesc.desc(),
              args.wdesc.desc(),
              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
              workspace_size_limit,
              &(result.algo)));
#endif
    }
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
        platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
            args.handle,
            args.idesc.desc(),
            args.odesc.desc(),
            args.cdesc.desc(),
            args.wdesc.desc(),
            CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
            workspace_size_limit,
            &(result.algo)));
#endif

    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  // Selects a backward-filter algorithm by letting cuDNN time the candidates.
  //
  // Template parameter T is the element type used to read the tensors in
  // `args` and to look up the cuDNN data type.
  //
  // * When the cuDNN data type of T is not CUDNN_DATA_HALF, the "Ex" find
  //   routine is run on the real tensors inside a workspace of
  //   GetMaxWorkspaceSize(...) bytes obtained from `ctx`, and the winner is
  //   picked by ChooseAlgoByWorkspace.
  // * Otherwise the non-"Ex" find routine is used with a result buffer sized
  //   by GetAlgorithmMaxCount, and the winner is picked by ChooseAlgo.
  //
  // In both cases only the entries cuDNN actually reported are considered,
  // and the workspace_size of the returned result is recomputed through
  // GetWorkspaceSize for the selected algorithm.
  template <typename T>
  static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
      const ConvArgs& args, const phi::GPUContext& ctx) {
    SearchResult<AlgoT> result;
    int returned_algo_count = 0;
    size_t workspace_size_limit =
        CalcWorkspaceLimitInBytes(UseFixedWorkspace());
    auto workspace_handle = ctx.cudnn_workspace_handle();
    if (platform::CudnnDataType<T>::type != CUDNN_DATA_HALF) {
      std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
      size_t max_workspace_size =
          GetMaxWorkspaceSize(args, workspace_size_limit);
      VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
              << " MB";

      // Executed by the workspace handle with a buffer of max_workspace_size
      // bytes; fills perf_results and returned_algo_count.
      auto cudnn_find_func = [&](void* workspace_ptr) {
        PADDLE_ENFORCE_GPU_SUCCESS(
            platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithmEx(
                args.handle,
                args.idesc.desc(),
                args.x->data<T>(),
                args.odesc.desc(),
                args.o->data<T>(),
                args.cdesc.desc(),
                args.wdesc.desc(),
                const_cast<T*>(args.w->data<T>()),
                kNUM_CUDNN_BWD_FILTER_ALGS,
                &returned_algo_count,
                perf_results.data(),
                workspace_ptr,
                max_workspace_size));
      };
      workspace_handle.RunFuncSync(
          cudnn_find_func, max_workspace_size, UseFixedWorkspace());
      // Keep only the entries cuDNN filled in, as the half branch below does.
      // Unfilled slots are value-initialized; assuming PerfT is the plain
      // cuDNN perf struct, their status is 0 (CUDNN_STATUS_SUCCESS) and their
      // memory is 0, so they could otherwise pass for valid candidates.
      perf_results.resize(returned_algo_count);

      VLOG(4) << GetPerfResultString<PerfT>(
          "[Exhaustive Search] BwdFilterAlgo Perf result",
          perf_results,
          returned_algo_count,
          workspace_size_limit);
      ChooseAlgoByWorkspace<PerfT, AlgoT>(
          perf_results, workspace_size_limit, &result);
    } else {
      int max_algos = GetAlgorithmMaxCount(args.handle);
      std::vector<PerfT> perf_results(max_algos);
      PADDLE_ENFORCE_GPU_SUCCESS(
          platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithm(
              args.handle,
              args.idesc.desc(),
              args.odesc.desc(),
              args.cdesc.desc(),
              args.wdesc.desc(),
              perf_results.size(),
              &returned_algo_count,
              perf_results.data()));
      // Drop the slots beyond what cuDNN reported.
      perf_results.resize(returned_algo_count);

      VLOG(4) << GetPerfResultString<PerfT>(
          "[Exhaustive Search] BwdFilterAlgo Perf result",
          perf_results,
          perf_results.size(),
          workspace_size_limit);
      ChooseAlgo(perf_results, workspace_size_limit, &result);
    }

    // Report the workspace actually required by the selected algorithm.
    result.workspace_size = GetWorkspaceSize(args, result.algo);
    return result;
  }

  // Returns how many backward-filter perf entries a find call may produce.
  //
  // When the CUDNN_VERSION_MIN(7, 0, 1) guard holds, cuDNN is queried through
  // `handle` and its answer is returned if the query succeeds. In every other
  // case the compile-time constant kNUM_CUDNN_BWD_FILTER_ALGS is returned.
  static int GetAlgorithmMaxCount(cudnnHandle_t handle) {
#if CUDNN_VERSION_MIN(7, 0, 1)
    int max_algos = 0;
    auto status =
        platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
            handle, &max_algos);
    // Compare against the cuDNN status enum, as the other status checks in
    // this struct do.
    if (status == CUDNN_STATUS_SUCCESS) {
      VLOG(5) << "[BackwardFilter] max_algos: predefined="
              << kNUM_CUDNN_BWD_FILTER_ALGS << ", actual=" << max_algos;
      return max_algos;
    }
#endif
    return kNUM_CUDNN_BWD_FILTER_ALGS;
  }

  static size_t GetMaxWorkspaceSize(const ConvArgs& args,
                                    size_t workspace_size_limit) {
834 835 836 837 838 839
    if (!UseFixedWorkspace()) {
      size_t max_workspace_size = 0;
      for (size_t algo = 0; algo < kNUM_CUDNN_BWD_FILTER_ALGS; ++algo) {
        size_t workspace_size = 0;
        auto status =
            platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
840 841 842 843 844
                args.handle,
                args.idesc.desc(),
                args.odesc.desc(),
                args.cdesc.desc(),
                args.wdesc.desc(),
845 846
                static_cast<cudnnConvolutionBwdFilterAlgo_t>(algo),
                &workspace_size);
847 848
        if (status == CUDNN_STATUS_SUCCESS &&
            workspace_size <= workspace_size_limit) {
849 850 851
          max_workspace_size = std::max(workspace_size, max_workspace_size);
        }
      }
852
      return max_workspace_size;
853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892
    } else {
      return workspace_size_limit;
    }
  }

  // Picks the first usable entry of `perf_results` and stores its algo and
  // time into `*algo_result`.
  //
  // An entry is usable when its status is CUDNN_STATUS_SUCCESS and its memory
  // requirement does not exceed `workspace_limit`. A usable Tensor Core entry
  // is passed over when the entry right after it is the same algorithm with
  // the same memory, succeeded, is not Tensor Core, and takes less than 1.01x
  // the time. `*algo_result` is left untouched if nothing is selected.
  static void ChooseAlgo(const std::vector<PerfT>& perf_results,
                         size_t workspace_limit,
                         SearchResult<AlgoT>* algo_result) {
    const size_t num_results = perf_results.size();
    for (size_t idx = 0; idx < num_results; ++idx) {
      const auto& candidate = perf_results[idx];
      const bool usable = candidate.status == CUDNN_STATUS_SUCCESS &&
                          candidate.memory <= workspace_limit;
      if (!usable) {
        continue;
      }

      const bool is_tensor_op = candidate.mathType == CUDNN_TENSOR_OP_MATH;
      if (is_tensor_op && idx + 1 < num_results) {
        const auto& follower = perf_results[idx + 1];
        const bool same_algo_without_tc =
            follower.status == CUDNN_STATUS_SUCCESS &&
            follower.algo == candidate.algo &&
            follower.memory == candidate.memory &&
            follower.mathType != CUDNN_TENSOR_OP_MATH;
        if (same_algo_without_tc && follower.time < 1.01 * candidate.time) {
          // The Tensor Core variant gains at most about 1%, so move on and
          // let the equivalent non-Tensor Core entry be considered instead.
          continue;
        }
      }

      algo_result->algo = candidate.algo;
      algo_result->time = candidate.time;
      const char* math_type_str = is_tensor_op ? "1" : "0";
      VLOG(3) << "    choose algo: " << candidate.algo
              << ", TC: " << math_type_str << ", time: " << candidate.time
              << " ms, wksp = " << candidate.memory
              << ", status = " << candidate.status;
      return;
    }
  }
Q
qingqing01 已提交
893 894 895 896
};

}  // namespace operators
}  // namespace paddle