提交 8ebcf948 编写于 作者: Z zhangting2020

change filter_grad algo

上级 c67c3916
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <algorithm>
#include <array>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/conv_search_cache.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
......@@ -90,6 +91,75 @@ std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
return out;
}
// Returns how many backward-filter algorithm results cuDNN can report for
// this handle (the upper bound to pass to
// cudnnFindConvolutionBackwardFilterAlgorithm). Yields 0 when the cuDNN
// version predates 7.0.1, where the max-count query API does not exist.
inline int MaxBackwardFilterAlgos(cudnnHandle_t cudnn_handle) {
  int algo_count = 0;
#if CUDNN_VERSION_MIN(7, 0, 1)
  PADDLE_ENFORCE_CUDA_SUCCESS(
      platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
          cudnn_handle, &algo_count));
#endif
  return algo_count;
}
// Selects, into *algo, the first acceptable algorithm from `perf_results`:
// one that completed successfully, is deterministic when `deterministic`
// is set, fits within `workspace_byte` bytes of workspace, and matches
// `algo_preference` (-1 accepts any algorithm), regardless of mathType.
// `perf_results` is assumed ordered fastest-first by cuDNN, so the first
// acceptable entry is the fastest. A Tensor-Core result that is not
// meaningfully faster than an equivalent non-Tensor-Core result right after
// it is skipped in favor of the latter. If nothing qualifies, *algo is left
// untouched and a diagnostic is logged.
template <typename PerfType, typename AlgoType>
void AlgoFinalSelect(const std::vector<PerfType>& perf_results,
                     const std::string& kernel_name, int32_t algo_preference,
                     size_t workspace_byte, AlgoType* algo,
                     bool deterministic) {
  VLOG(3) << "=========Full results of algo=========" << kernel_name << ":";
  for (const auto& result : perf_results) {
    // "+" marks entries that used Tensor Core math, "-" those that did not.
    auto math_type_str = (result.mathType == CUDNN_TENSOR_OP_MATH) ? "+" : "-";
    VLOG(3) << " algo: " << result.algo << ", TC" << math_type_str
            << ", time: " << result.time << " ms"
            << ", wksp = " << result.memory << ", status = " << result.status;
  }

  for (decltype(perf_results.size()) i = 0; i != perf_results.size(); ++i) {
    const auto& result = perf_results[i];
    if (result.status == CUDNN_STATUS_SUCCESS &&
        (!deterministic ||
         result.determinism == cudnnDeterminism_t::CUDNN_DETERMINISTIC) &&
        (result.memory <= workspace_byte) &&
        (algo_preference == -1 || algo_preference == result.algo)) {
      if ((result.mathType == CUDNN_TENSOR_OP_MATH) &&
          (i != perf_results.size() - 1)) {
        const auto& next_result = perf_results[i + 1];
        if (next_result.status == CUDNN_STATUS_SUCCESS &&
            next_result.algo == result.algo &&
            next_result.memory == result.memory &&
            next_result.mathType != CUDNN_TENSOR_OP_MATH &&
            next_result.time < 1.01 * result.time) {
          // Skip over this result- it's not really a Tensor Core algo.
          // Prefer instead the next equivalent non-Tensor Core algo.
          continue;
        }
      }
      *algo = result.algo;
      auto math_type_str =
          (result.mathType == CUDNN_TENSOR_OP_MATH) ? "+" : "-";
      VLOG(3) << " choose algo: " << result.algo << ", TC" << math_type_str
              << ", time: " << result.time << " ms"
              << ", wksp = " << result.memory << ", status = " << result.status;
      return;
    }
  }
  // No result satisfied every constraint; caller's *algo keeps its prior
  // value. Surface this in the verbose log so failures are diagnosable.
  VLOG(3) << " no acceptable algo found for " << kernel_name;
}
using framework::ConvSearchCache;
struct ConvArgs {
......@@ -396,6 +466,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
algo_t algo;
if (!exhaustive && !deterministic) {
#if CUDNN_VERSION >= 7001
/*
using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t;
int perf_count;
int best_algo_idx = 0;
......@@ -411,7 +482,39 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
if (workspace_size > workspace_size_limit) {
workspace_size = workspace_size_limit;
}
auto math_type_str = "-";
if ((perf_results.get())[best_algo_idx].mathType ==
CUDNN_TENSOR_OP_MATH) {
math_type_str = "+";
}
VLOG(3) << " algo: " << (perf_results.get())[best_algo_idx].algo
<< ", TC" << math_type_str
<< ", time: " << (perf_results.get())[best_algo_idx].time << " ms"
<< ", wksp = " << (perf_results.get())[best_algo_idx].memory
<< ", status = " << (perf_results.get())[best_algo_idx].status;
*/
auto max_bwd_filt_algos = MaxBackwardFilterAlgos(args.handle);
std::vector<cudnnConvolutionBwdFilterAlgoPerf_t> bwd_filt_results(
max_bwd_filt_algos);
int actual_bwd_filter_algos = 0;
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithm(
args.handle, args.idesc.desc(), args.odesc.desc(),
args.cdesc.desc(), args.wdesc.desc(), bwd_filt_results.size(),
&actual_bwd_filter_algos, bwd_filt_results.data()));
bwd_filt_results.resize(actual_bwd_filter_algos);
AlgoFinalSelect<cudnnConvolutionBwdFilterAlgoPerf_t,
cudnnConvolutionBwdFilterAlgo_t>(
bwd_filt_results, "backprop-to-filter", -1, workspace_size_limit,
&algo, deterministic);
workspace_size = GetWorkspaceSize(args, algo);
if (workspace_size > workspace_size_limit) {
workspace_size = workspace_size_limit;
}
#else
VLOG(3) << "=======cudnnGetConvolutionBackwardFilterAlgorithm=====";
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
args.handle, args.idesc.desc(), args.odesc.desc(),
......@@ -420,8 +523,10 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
workspace_size_limit, &algo));
#endif
} else if (deterministic) {
VLOG(3) << "======choose deterministic algo======";
return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
} else {
VLOG(3) << "========Get cache algo===========";
auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
......
......@@ -95,6 +95,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnGetVersion); \
__macro(cudnnFindConvolutionForwardAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardFilterAlgorithm); \
__macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \
__macro(cudnnGetErrorString); \
__macro(cudnnCreateDropoutDescriptor); \
......@@ -194,7 +195,8 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
__macro(cudnnBatchNormalizationForwardTrainingEx); \
__macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \
__macro(cudnnBatchNormalizationBackwardEx); \
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize);
__macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize); \
__macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount);
CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册