Unverified commit 560c8153, authored by Yiqun Liu and committed by GitHub

Add some checks for the CUDA Driver API and NVRTC (#22719)

* Add a check for whether the CUDA Driver and NVRTC are available on the runtime system.

* Call cuInit to initialize the CUDA Driver API before any CUDA calls.
test=develop

* Change the behavior when libnvrtc.so cannot be found: print a warning instead of exiting.
test=develop

* Do not initialize the CUDA Driver API on Windows and macOS.
test=develop

* Remove the call to cuInit when entering Paddle and enable test_code_generator.
test=develop

* Add some built-in functions for __half.
test=develop

* Change save_intermediate_out to false in the unit test.
test=develop

* Fix an erroneous reference to a temporary variable when setting the include path for device_code.
test=develop
Parent 9ec5eb0e
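A minimal standalone sketch of the availability probe described above and implemented later in CUDADeviceCode::CheckAvailableStatus: query the NVRTC and CUDA driver versions, then count devices. Unlike Paddle's dynload wrappers, this sketch links libcuda and libnvrtc directly, and the file name and build command are only illustrative (e.g. nvcc probe.cc -lcuda -lnvrtc):

// probe.cc: report whether NVRTC and the CUDA driver are usable at runtime.
#include <cstdio>
#include <cuda.h>
#include <nvrtc.h>

int main() {
  // NVRTC version query fails if libnvrtc is missing or broken.
  int nvrtc_major = 0, nvrtc_minor = 0;
  if (nvrtcVersion(&nvrtc_major, &nvrtc_minor) != NVRTC_SUCCESS) {
    std::fprintf(stderr, "NVRTC is not usable on this system\n");
    return 1;
  }
  // Driver version query and cuInit fail if no CUDA driver is installed.
  int driver_version = 0;
  if (cuDriverGetVersion(&driver_version) != CUDA_SUCCESS ||
      cuInit(0) != CUDA_SUCCESS) {
    std::fprintf(stderr, "CUDA driver is not usable on this system\n");
    return 1;
  }
  int count = 0;
  if (cuDeviceGetCount(&count) != CUDA_SUCCESS || count == 0) {
    std::fprintf(stderr, "no CUDA device found\n");
    return 1;
  }
  std::printf("CUDA driver %d.%d, NVRTC %d.%d, %d device(s)\n",
              driver_version / 1000, (driver_version % 1000) / 10,
              nvrtc_major, nvrtc_minor, count);
  return 0;
}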
......@@ -2,7 +2,7 @@ cc_library(code_generator
SRCS operation.cc code_generator.cc code_generator_helper.cc
DEPS graph subgraph_detector)
if(WITH_GPU)
# cc_test(test_code_generator SRCS code_generator_tester.cc DEPS code_generator device_code lod_tensor graph_viz_pass)
cc_test(test_code_generator SRCS code_generator_tester.cc DEPS code_generator device_code lod_tensor graph_viz_pass)
endif()
cc_library(fusion_group_pass
......
......@@ -209,7 +209,10 @@ std::set<int> CodeGenerator::DistilIntermediateIds(
for (size_t i = 0; i < expressions.size(); i++) {
for (auto id : expressions[i].GetOutputIds()) {
auto intermediate_state = expressions[i].GetIntermediateState();
if (intermediate_state[id]) intermediate_ids.insert(id);
if (intermediate_state.find(id) != intermediate_state.end() &&
intermediate_state[id]) {
intermediate_ids.insert(id);
}
}
}
return intermediate_ids;
......
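The added find() guard works around a standard std::unordered_map behavior: operator[] default-inserts an entry for a missing key as a side effect of the lookup, and it cannot be called on a const map. A tiny self-contained illustration, independent of Paddle:

#include <cassert>
#include <unordered_map>

int main() {
  std::unordered_map<int, bool> state = {{1, true}};

  // operator[] inserts {2, false} because key 2 is absent: the map grows
  // as a side effect of what looks like a read.
  bool looked_up = state[2];
  assert(!looked_up && state.size() == 2);

  // find() is a pure lookup: no insertion, and it works on a const map.
  auto it = state.find(3);
  assert(it == state.end() && state.size() == 2);
  return 0;
}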
......@@ -44,10 +44,11 @@ static inline std::string VarName(int index) {
class OperationExpression {
public:
explicit OperationExpression(std::string op_type, std::vector<int> input_ids,
std::vector<int> output_ids,
std::string rhs_type, std::string lhs_type,
std::unordered_map<int, bool> intermediate_state)
explicit OperationExpression(
std::string op_type, const std::vector<int>& input_ids,
const std::vector<int>& output_ids, std::string rhs_type,
std::string lhs_type,
const std::unordered_map<int, bool>& intermediate_state = {})
: op_type_(op_type),
input_ids_(input_ids),
output_ids_(output_ids),
......
......@@ -176,7 +176,6 @@ void TestMainImpl(std::string func_name, std::string code_str,
bool is_float16 = std::type_index(typeid(T)) ==
std::type_index(typeid(paddle::platform::float16));
paddle::framework::InitDevices(false, {0});
paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceCode device_code(place, func_name, code_str);
device_code.Compile(is_float16);
......@@ -266,7 +265,7 @@ void TestElementwiseMain(
}
int n = cpu_tensors[0].numel();
if (dtype == "float16") {
if (dtype == "__half") {
TestMainImpl<paddle::platform::float16>(func_name, code_str, cpu_tensors, n,
input_ids, output_ids);
} else {
......@@ -275,7 +274,7 @@ void TestElementwiseMain(
}
// Check the results
float eps = (dtype == "float16") ? 1E-2 : 1E-5;
float eps = (dtype == "__half") ? 1E-2 : 1E-5;
for (int i = 0; i < n; i++) {
fusion_group::CheckOutput(expressions, cpu_tensors, input_ids, output_ids,
i, eps);
......@@ -312,7 +311,7 @@ void TestMain(fusion_group::SubGraph* subgraph, std::vector<int> input_ids,
}
TEST(code_generator, elementwise) {
for (std::string dtype : {"float", "float16"}) {
for (std::string dtype : {"float", "__half"}) {
// t2 = t0 * t1
// t4 = t2 + t3
// t6 = t4 - t5
......@@ -342,7 +341,7 @@ TEST(code_generator, elementwise) {
}
TEST(code_generator, elementwise_grad) {
for (std::string dtype : {"float", "float16"}) {
for (std::string dtype : {"float", "__half"}) {
// The var order: t0, t1, t2, t3, t0', t1', t2', t3'
// t2 = t0 * t1
// t3 = relu(t2)
......@@ -407,7 +406,7 @@ std::unique_ptr<paddle::framework::ir::Graph> BuildGraph(bool backward,
std::unique_ptr<paddle::framework::ir::Graph> graph(
new paddle::framework::ir::Graph(layers.main_program()));
auto proto_dtype = (dtype == "float16")
auto proto_dtype = (dtype == "__half")
? paddle::framework::proto::VarType::FP16
: paddle::framework::proto::VarType::FP32;
for (auto* n : graph->Nodes()) {
......@@ -463,10 +462,10 @@ std::unordered_set<paddle::framework::ir::Node*> DistilGradNodes(
}
TEST(code_generator, subgraph) {
for (std::string dtype : {"float", "float16"}) {
for (std::string dtype : {"float", "__half"}) {
std::unique_ptr<paddle::framework::ir::Graph> graph =
BuildGraph(false, dtype);
fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", true,
fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", false,
graph->Nodes());
// Expressions generated by code_generator (they may be different):
......@@ -482,10 +481,10 @@ TEST(code_generator, subgraph) {
}
TEST(code_generator, subgraph_grad) {
for (std::string dtype : {"float", "float16"}) {
for (std::string dtype : {"float", "__half"}) {
std::unique_ptr<paddle::framework::ir::Graph> graph =
BuildGraph(true, dtype);
fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", true,
fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", false,
DistilGradNodes(graph));
// Expressions generated by code_generator (they may be different):
......
......@@ -35,6 +35,7 @@ __device__ inline double Sqrt(double x) { return sqrt(x); }
)";
// List some built-in functions of __half implemented in cuda_fp16.hpp
static constexpr char predefined_cuda_functions_fp16[] = R"(
#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
......@@ -193,6 +194,12 @@ __CUDA_FP16_DECL__ __half __hdiv(__half a, __half b) {
return v;
}
__CUDA_FP16_DECL__ __half __hneg(const __half a)
{
__half zero;
zero = __float2half(0.0);
return __hsub(zero, a);
}
/* Some basic arithmetic operations expected of a builtin */
__device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); }
......@@ -200,6 +207,10 @@ __device__ __forceinline__ __half operator-(const __half &lh, const __half &rh)
__device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); }
__device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); }
/* Unary plus and inverse operators */
__device__ __forceinline__ __half operator+(const __half &h) { return h; }
__device__ __forceinline__ __half operator-(const __half &h) { return __hneg(h); }
/* Some basic comparison operations to make it look like a builtin */
__device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); }
__device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hne(lh, rh); }
......
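With __hneg and the unary operators above injected into the JIT-compiled source, a generated __half kernel can negate values the same way a float kernel does. The following standalone CUDA sketch shows the same operators used in offline-compiled code (sm_53 or newer is required for __half arithmetic); the kernel and file names are purely illustrative:

// negate_half.cu, build with: nvcc -arch=sm_53 negate_half.cu
#include <cstdio>
#include <cuda_fp16.h>

__global__ void NegateViaHalf(const float* x, float* y, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    __half h = __float2half(x[i]);
    y[i] = __half2float(-h);  // unary operator-(__half), i.e. __hneg
  }
}

int main() {
  const int n = 4;
  float h_x[n] = {1.0f, 2.0f, 3.0f, 4.0f};
  float h_y[n] = {0.0f};
  float *d_x = nullptr, *d_y = nullptr;
  cudaMalloc(&d_x, n * sizeof(float));
  cudaMalloc(&d_y, n * sizeof(float));
  cudaMemcpy(d_x, h_x, n * sizeof(float), cudaMemcpyHostToDevice);
  NegateViaHalf<<<1, n>>>(d_x, d_y, n);
  cudaMemcpy(h_y, d_y, n * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) {
    std::printf("-(%g) = %g\n", h_x[i], h_y[i]);
  }
  cudaFree(d_x);
  cudaFree(d_y);
  return 0;
}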
......@@ -30,6 +30,14 @@ namespace ir {
void FusionGroupPass::ApplyImpl(ir::Graph* graph) const {
FusePassBase::Init("fusion_group_pass", graph);
if (Get<bool>("use_gpu")) {
// TODO(liuyiqun): enable this check.
// if (!platform::CUDADeviceCode::IsAvailable()) {
// LOG(WARNING)
// << "Disable fusion_group because CUDA Driver or NVRTC is not
// available.";
// return 0;
// }
fusion_group::OperationMap::Init();
int num_elementwise_groups = DetectFusionGroup(graph, 0);
AddStatis(num_elementwise_groups);
......@@ -61,7 +69,7 @@ int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const {
subgraph.DetectIntermediateOutWithGraph(graph);
}
if (subgraph.IsValid(min_subgraph_size)) {
subgraph.SetFuncName("FusedElementwise" + std::to_string(index++));
subgraph.SetFuncName("fused_elementwise_" + std::to_string(index++));
if (GenerateCode(&subgraph)) {
InsertFusionGroupOp(graph, &subgraph);
num_subgraphs++;
......
......@@ -79,9 +79,61 @@ DeviceCodePool::DeviceCodePool(const std::vector<platform::Place>& places) {
#endif
}
}
#ifdef PADDLE_WITH_CUDA
CUDADeviceCode::CheckAvailableStatus();
#endif
}
#ifdef PADDLE_WITH_CUDA
static bool CheckCUDADriverResult(CUresult result, std::string caller,
std::string kernel_name = "") {
if (result != CUDA_SUCCESS) {
const char* error = nullptr;
dynload::cuGetErrorString(result, &error);
LOG_FIRST_N(WARNING, 1) << "Call " << caller << " for < " << kernel_name
<< " > failed: " << error << " (" << result << ")";
return false;
}
return true;
}
bool CUDADeviceCode::available_ = false;
void CUDADeviceCode::CheckAvailableStatus() {
available_ = false;
if (!dynload::HasNVRTC() || !dynload::HasCUDADriver()) {
LOG_FIRST_N(WARNING, 1)
<< "NVRTC and CUDA driver are need for JIT compiling of CUDA code.";
return;
}
int nvrtc_major = 0;
int nvrtc_minor = 0;
nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor);
int driver_version = 0;
int driver_major = 0;
int driver_minor = 0;
CUresult driver_result = dynload::cuDriverGetVersion(&driver_version);
if (driver_result == CUDA_SUCCESS) {
driver_major = driver_version / 1000;
driver_minor = (driver_version % 1000) / 10;
}
LOG_FIRST_N(INFO, 1) << "CUDA Driver Version: " << dirver_major << "."
<< driver_minor << "; NVRTC Version: " << nvrtc_major
<< "." << nvrtc_minor;
if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) {
return;
}
int count = 0;
if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count),
"cuDeviceGetCount")) {
available_ = true;
}
}
static std::string FindCUDAIncludePath() {
auto EndWith = [](std::string str, std::string substr) -> bool {
size_t pos = str.rfind(substr);
......@@ -137,7 +189,7 @@ CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name,
bool CUDADeviceCode::Compile(bool include_path) {
is_compiled_ = false;
if (!dynload::HasNVRTC() || !dynload::HasCUDADriver()) {
LOG(WARNING)
LOG_FIRST_N(WARNING, 1)
<< "NVRTC and CUDA driver are need for JIT compiling of CUDA code.";
return false;
}
......@@ -160,10 +212,11 @@ bool CUDADeviceCode::Compile(bool include_path) {
std::string compute_flag =
"--gpu-architecture=compute_" + std::to_string(compute_capability);
std::vector<const char*> options = {"--std=c++11", compute_flag.c_str()};
std::string include_option;
if (include_path) {
std::string cuda_include_path = FindCUDAIncludePath();
if (!cuda_include_path.empty()) {
std::string include_option = "--include-path=" + cuda_include_path;
include_option = "--include-path=" + cuda_include_path;
options.push_back(include_option.c_str());
}
}
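Hoisting include_option out of the if-block, as the diff above does, fixes the lifetime bug named in the commit message: previously the std::string was destroyed at the end of the block, leaving a dangling pointer from c_str() inside options. A self-contained sketch of the fixed pattern, with a placeholder include path:

#include <cstdio>
#include <string>
#include <vector>

int main() {
  const std::string cuda_include_path = "/usr/local/cuda/include";  // placeholder
  std::vector<const char*> options = {"--std=c++11"};

  // The option string lives in the enclosing scope so the pointer pushed
  // into options stays valid for as long as the option list is used.
  // Declaring it inside the if-block (as the old code did) would leave a
  // dangling pointer once the block ends.
  std::string include_option;
  if (!cuda_include_path.empty()) {
    include_option = "--include-path=" + cuda_include_path;
    options.push_back(include_option.c_str());
  }

  for (const char* opt : options) {
    std::printf("%s\n", opt);
  }
  return 0;
}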
......@@ -209,13 +262,13 @@ bool CUDADeviceCode::Compile(bool include_path) {
}
if (!CheckCUDADriverResult(dynload::cuModuleLoadData(&module_, ptx_.data()),
"cuModuleLoadData")) {
"cuModuleLoadData", name_)) {
return false;
}
if (!CheckCUDADriverResult(
dynload::cuModuleGetFunction(&function_, module_, name_.c_str()),
"cuModuleGetFunction")) {
"cuModuleGetFunction", name_)) {
return false;
}
......@@ -253,19 +306,9 @@ void CUDADeviceCode::Launch(const size_t n, std::vector<void*>* args) const {
bool CUDADeviceCode::CheckNVRTCResult(nvrtcResult result,
std::string function) {
if (result != NVRTC_SUCCESS) {
LOG(WARNING) << "Call " << function
<< " failed: " << dynload::nvrtcGetErrorString(result);
return false;
}
return true;
}
bool CUDADeviceCode::CheckCUDADriverResult(CUresult result,
std::string function) {
if (result != CUDA_SUCCESS) {
const char* error = nullptr;
LOG(WARNING) << "Call " << function
<< " failed: " << dynload::cuGetErrorString(result, &error);
LOG_FIRST_N(WARNING, 1)
<< "Call " << function << " for < " << name_
<< " > failed: " << dynload::nvrtcGetErrorString(result);
return false;
}
return true;
......
......@@ -56,9 +56,13 @@ class CUDADeviceCode : public DeviceCode {
workload_per_thread_ = workload_per_thread;
}
static void CheckAvailableStatus();
static bool IsAvailable() { return available_; }
private:
bool CheckNVRTCResult(nvrtcResult result, std::string function);
bool CheckCUDADriverResult(CUresult result, std::string function);
static bool available_;
bool is_compiled_{false};
int max_threads_{0};
......
......@@ -60,6 +60,8 @@ extern bool HasCUDADriver();
* include all needed cuda driver functions
**/
#define CUDA_ROUTINE_EACH(__macro) \
__macro(cuInit); \
__macro(cuDriverGetVersion); \
__macro(cuGetErrorString); \
__macro(cuModuleLoadData); \
__macro(cuModuleGetFunction); \
......@@ -68,7 +70,7 @@ extern bool HasCUDADriver();
__macro(cuLaunchKernel); \
__macro(cuCtxCreate); \
__macro(cuCtxGetCurrent); \
__macro(cuDeviceGet); \
__macro(cuDeviceGetCount); \
__macro(cuDevicePrimaryCtxGetState)
CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
......
......@@ -164,10 +164,10 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
}
auto error_msg =
"Failed to find dynamic library: %s ( %s ) \n Please specify "
"its path correctly using following ways: \n Method. set "
"its path correctly using following ways: \n set "
"environment variable LD_LIBRARY_PATH on Linux or "
"DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: "
"export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
"DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: "
"export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
"using the DYLD_LIBRARY_PATH is impossible unless System "
"Integrity Protection (SIP) is disabled.";
#if !defined(_WIN32)
......@@ -238,17 +238,17 @@ void* GetCusolverDsoHandle() {
void* GetNVRTCDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib");
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false);
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so");
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false);
#endif
}
void* GetCUDADsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib");
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false);
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so");
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false);
#endif
}
......
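According to the commit message, the extra false argument passed to GetDsoHandleFromSearchPath above turns a missing libnvrtc.so or libcuda.so into a warning instead of a fatal error. A rough standalone sketch of such a warn-or-fail loading policy using plain dlopen; the helper name and messages are illustrative and not Paddle's actual implementation (build with e.g. g++ load_optional.cc -ldl):

#include <dlfcn.h>
#include <cstdio>
#include <cstdlib>

// Warn-or-exit policy for loading a shared library: when fatal is false, a
// missing optional library (e.g. libnvrtc.so) only produces a warning and
// the caller receives a null handle, so the dependent feature can be disabled.
void* LoadLibrary(const char* name, bool fatal) {
  void* handle = dlopen(name, RTLD_LAZY);
  if (handle == nullptr) {
    if (fatal) {
      std::fprintf(stderr, "Failed to load required library %s: %s\n", name,
                   dlerror());
      std::exit(1);
    }
    std::fprintf(stderr,
                 "Warning: %s not found (%s); JIT compilation of CUDA code "
                 "will be disabled.\n",
                 name, dlerror());
  }
  return handle;
}

int main() {
  void* nvrtc = LoadLibrary("libnvrtc.so", /*fatal=*/false);
  std::printf("libnvrtc.so %s\n", nvrtc != nullptr ? "loaded" : "unavailable");
  if (nvrtc != nullptr) dlclose(nvrtc);
  return 0;
}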
......@@ -60,6 +60,7 @@ extern bool HasNVRTC();
* include all needed nvrtc functions
**/
#define NVRTC_ROUTINE_EACH(__macro) \
__macro(nvrtcVersion); \
__macro(nvrtcGetErrorString); \
__macro(nvrtcCompileProgram); \
__macro(nvrtcCreateProgram); \
......