From eb100c7b4b3cb0f02943c886e0b0b2af025641dd Mon Sep 17 00:00:00 2001 From: Tian Zheng Date: Wed, 2 Nov 2022 10:15:25 +0800 Subject: [PATCH] Add build option for CUDNN Frontend API (#47524) * Add build option for CUDNN Frontend API * Fix review comments * Change namespace for cudnn_frontend.h --- CMakeLists.txt | 2 + cmake/configure.cmake | 4 + cmake/external/cudnn-frontend.cmake | 60 ++++++++ cmake/flags.cmake | 5 + cmake/third_party.cmake | 5 + paddle/fluid/platform/flags.cc | 12 ++ paddle/phi/backends/dynload/CMakeLists.txt | 7 + paddle/phi/backends/dynload/cudnn.cc | 4 + paddle/phi/backends/dynload/cudnn.h | 13 ++ paddle/phi/backends/dynload/cudnn_frontend.h | 62 ++++++++ .../backends/dynload/cudnn_frontend_test.cc | 44 ++++++ .../0001-patch-for-paddle.patch | 137 ++++++++++++++++++ 12 files changed, 355 insertions(+) create mode 100644 cmake/external/cudnn-frontend.cmake create mode 100644 paddle/phi/backends/dynload/cudnn_frontend.h create mode 100644 paddle/phi/backends/dynload/cudnn_frontend_test.cc create mode 100644 patches/cudnn-frontend/0001-patch-for-paddle.patch diff --git a/CMakeLists.txt b/CMakeLists.txt index f2489526c5c..187162cffe7 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -305,6 +305,8 @@ option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF) option(WITH_ARM_BRPC "Supprot Brpc in Arm" OFF) option(WITH_FLPS "FL PS mode" OFF) option(WITH_RPC "Compile with rpc support" ${WITH_DISTRIBUTE}) +option(WITH_CUDNN_FRONTEND + "Compile with CUDNN Frontend API support (experimental)" OFF) if(WITH_RECORD_BUILDTIME) set_property( diff --git a/cmake/configure.cmake b/cmake/configure.cmake index c49a879fa02..5147e54ea71 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -248,3 +248,7 @@ endif() if(WITH_GPU_GRAPH) add_definitions(-DPADDLE_WITH_GPU_GRAPH) endif() + +if(WITH_CUDNN_FRONTEND) + add_definitions(-DPADDLE_WITH_CUDNN_FRONTEND) +endif() diff --git a/cmake/external/cudnn-frontend.cmake b/cmake/external/cudnn-frontend.cmake new file mode 100644 index 00000000000..10a49110f84 --- /dev/null +++ b/cmake/external/cudnn-frontend.cmake @@ -0,0 +1,60 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +include(ExternalProject) + +set(CUDNN_FRONTEND_CUDNN_MIN_VERSION 8000) + +if(NOT WITH_GPU) + message(FATAL_ERROR "Can't enable CUDNN Frontend API without CUDA.") +endif() +if(CUDNN_VERSION LESS 8000) + message( + FATAL_ERROR + "Minimum CUDNN version is ${CUDNN_FRONTEND_CUDNN_MIN_VERSION}. Current: ${CUDNN_VERSION}" + ) +endif() + +# Version: v0.7.1 +set(CUDNN_FRONTEND_PREFIX_DIR ${THIRD_PARTY_PATH}/cudnn-frontend) +set(CUDNN_FRONTEND_SOURCE_DIR + ${THIRD_PARTY_PATH}/cudnn-frontend/src/extern_cudnn_frontend/include) +set(CUDNN_FRONTEND_REPOSITORY https://github.com/NVIDIA/cudnn-frontend.git) +set(CUDNN_FRONTEND_TAG v0.7.1) + +set(CUDNN_FRONTEND_INCLUDE_DIR ${CUDNN_FRONTEND_SOURCE_DIR}) +include_directories(${CUDNN_FRONTEND_INCLUDE_DIR}) + +message( + STATUS + "Adding cudnn-frontend. Version: ${CUDNN_FRONTEND_TAG}. Directory: ${CUDNN_FRONTEND_INCLUDE_DIR}" +) + +ExternalProject_Add( + extern_cudnn_frontend + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${CUDNN_FRONTEND_REPOSITORY} + GIT_TAG ${CUDNN_FRONTEND_TAG} + PREFIX ${CUDNN_FRONTEND_PREFIX_DIR} + UPDATE_COMMAND "" + PATCH_COMMAND + patch -d ${CUDNN_FRONTEND_SOURCE_DIR} -p2 < + ${PADDLE_SOURCE_DIR}/patches/cudnn-frontend/0001-patch-for-paddle.patch + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "") + +add_library(cudnn-frontend INTERFACE) +add_dependencies(cudnn-frontend extern_cudnn_frontend) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 39261a788bd..a58f4094bbb 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -162,6 +162,11 @@ if(NOT WIN32) ) endif() + if(WITH_CUDNN_FRONTEND) + # flags from https://github.com/NVIDIA/cudnn-frontend/blob/v0.7.1/CMakeLists.txt + set(COMMON_FLAGS ${COMMON_FLAGS} -Wno-sign-compare -Wno-non-virtual-dtor) + endif() + if(WITH_ASCEND_CL AND WITH_ARM_BRPC) set(COMMON_FLAGS ${COMMON_FLAGS} -faligned-new) endif() diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 4475f5b14d2..28b2bae9518 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -515,4 +515,9 @@ if(WITH_GPU endif() endif() +if(WITH_CUDNN_FRONTEND) + include(external/cudnn-frontend) # download cudnn-frontend + list(APPEND third_party_deps extern_cudnn_frontend) +endif() + add_custom_target(third_party ALL DEPENDS ${third_party_deps}) diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index bac075c1d90..23ecfecbbd2 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -1021,3 +1021,15 @@ PADDLE_DEFINE_EXPORTED_bool( PADDLE_DEFINE_EXPORTED_string(jit_engine_type, "Predictor", "Choose default funciton type in JitLayer."); + +#ifdef PADDLE_WITH_CUDNN_FRONTEND +/** + * CUDNNv8 related FLAG + * Name: enable_cudnn_frontend + * Since Version: 2.5.0 + * Value Range: bool, default=false + * Example: + * Note: Enable CUDNNv8 Frontend API for CUDNN kernels. + */ +PADDLE_DEFINE_EXPORTED_bool(enable_cudnn_frontend, false, ""); +#endif // PADDLE_WITH_CUDNN_FRONTEND diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index 49ab8d4f0c9..98a44461ac4 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -99,3 +99,10 @@ if(MKL_FOUND AND WITH_ONEMKL) DEPS phi_dynamic_loader) target_include_directories(phi_dynload_mklrt PRIVATE ${MKL_INCLUDE}) endif() + +if(WITH_CUDNN_FRONTEND) + nv_test( + cudnn_frontend_test + SRCS cudnn_frontend_test.cc + DEPS phi_dynload_cuda cudnn-frontend) +endif() diff --git a/paddle/phi/backends/dynload/cudnn.cc b/paddle/phi/backends/dynload/cudnn.cc index 8aa3b623273..9bd38a89ab1 100644 --- a/paddle/phi/backends/dynload/cudnn.cc +++ b/paddle/phi/backends/dynload/cudnn.cc @@ -46,6 +46,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP); #endif +#ifdef CUDNN_DNN_ROUTINE_EACH_FRONTEND +CUDNN_DNN_ROUTINE_EACH_FRONTEND(DEFINE_WRAP); +#endif + bool HasCUDNN() { std::call_once(cudnn_dso_flag, []() { cudnn_dso_handle = GetCUDNNDsoHandle(); }); diff --git a/paddle/phi/backends/dynload/cudnn.h b/paddle/phi/backends/dynload/cudnn.h index 7b9004308e9..3292beb0371 100644 --- a/paddle/phi/backends/dynload/cudnn.h +++ b/paddle/phi/backends/dynload/cudnn.h @@ -194,6 +194,19 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif +#ifdef PADDLE_WITH_CUDNN_FRONTEND +#define CUDNN_DNN_ROUTINE_EACH_FRONTEND(__macro) \ + __macro(cudnnBackendCreateDescriptor); \ + __macro(cudnnBackendDestroyDescriptor); \ + __macro(cudnnBackendExecute); \ + __macro(cudnnBackendFinalize); \ + __macro(cudnnBackendGetAttribute); \ + __macro(cudnnBackendSetAttribute); \ + __macro(cudnnGetStream); \ + __macro(cudnnReorderFilterAndBias); +CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + } // namespace dynload } // namespace phi diff --git a/paddle/phi/backends/dynload/cudnn_frontend.h b/paddle/phi/backends/dynload/cudnn_frontend.h new file mode 100644 index 00000000000..4d0b67ce228 --- /dev/null +++ b/paddle/phi/backends/dynload/cudnn_frontend.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Corporation. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "gflags/gflags.h" +#include "glog/logging.h" + +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/backends/gpu/gpu_info.h" + +DECLARE_bool(enable_cudnn_frontend); + +// Redirect the CUDNN APIs in the cudnn_frontend namespace to +// the functions in phi::dynload +#define CUDNN_FRONTEND_OVERRIDE_SYMBOL(__name) using phi::dynload::__name + +#define CUDNN_FRONTEND_APPLY_EACH(__macro) \ + __macro(cudnnBackendCreateDescriptor); \ + __macro(cudnnBackendDestroyDescriptor); \ + __macro(cudnnBackendExecute); \ + __macro(cudnnBackendFinalize); \ + __macro(cudnnBackendGetAttribute); \ + __macro(cudnnBackendSetAttribute); \ + __macro(cudnnCreateFilterDescriptor); \ + __macro(cudnnDestroyFilterDescriptor); \ + __macro(cudnnGetStream); \ + __macro(cudnnGetVersion); \ + __macro(cudnnReorderFilterAndBias); \ + __macro(cudnnSetFilterNdDescriptor); + +namespace cudnn_frontend { +CUDNN_FRONTEND_APPLY_EACH(CUDNN_FRONTEND_OVERRIDE_SYMBOL); +} // namespace cudnn_frontend + +// clang-format off +#include // NOLINT +#include // NOLINT +#include // NOLINT +// clang-format on + +namespace phi { +namespace dynload { +inline bool IsCudnnFrontendEnabled() { + int cudnn_version = phi::backends::gpu::DnnVersion(); + bool flag_enabled = FLAGS_enable_cudnn_frontend && (cudnn_version >= 8000); + VLOG(3) << "[cudnn_frontend] flag_enabled=" << flag_enabled; + return flag_enabled; +} +} // namespace dynload +} // namespace phi diff --git a/paddle/phi/backends/dynload/cudnn_frontend_test.cc b/paddle/phi/backends/dynload/cudnn_frontend_test.cc new file mode 100644 index 00000000000..03425a7545e --- /dev/null +++ b/paddle/phi/backends/dynload/cudnn_frontend_test.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/backends/dynload/cudnn_frontend.h" + +TEST(CudnnFrontendTest, TensorCreation) { + // Consider creation of a 2d Tensor + // n,c,h,w as 4,32,32,32 + std::cout << "Tensor creation comparison" << std::endl; + std::array tensor_dim = {4, 32, 32, 32}; + std::array tensor_str = {32768, 1024, 32, 1}; // NCHW format + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; + int64_t alignment = sizeof(float); + int64_t id = 0xD0D0CACA; // Some magic number + + try { + auto tensor = cudnn_frontend::TensorBuilder() + .setDim(tensor_dim.size(), tensor_dim.data()) + .setStrides(tensor_str.size(), tensor_str.data()) + .setId(id) + .setAlignment(alignment) + .setDataType(data_type) + .build(); + } catch (cudnn_frontend::cudnnException &e) { + std::cout << "Exception in tensor creation " << e.what() << std::endl; + FAIL(); + } + std::cout << "Finished tensor creation." << std::endl; +} diff --git a/patches/cudnn-frontend/0001-patch-for-paddle.patch b/patches/cudnn-frontend/0001-patch-for-paddle.patch new file mode 100644 index 00000000000..bf5288f06ee --- /dev/null +++ b/patches/cudnn-frontend/0001-patch-for-paddle.patch @@ -0,0 +1,137 @@ +From dce3465da518641ee177187fbc0c0d36faea28f2 Mon Sep 17 00:00:00 2001 +From: Tian Zheng +Date: Thu, 27 Oct 2022 20:33:16 -0700 +Subject: [PATCH] patch for paddle + +--- + include/cudnn_frontend_ExecutionPlan.h | 10 +++++++--- + include/cudnn_frontend_ExecutionPlanCache.h | 2 +- + include/cudnn_frontend_OperationGraph.h | 2 +- + include/cudnn_frontend_find_plan.h | 6 +++--- + include/cudnn_frontend_get_plan.h | 4 ++-- + 5 files changed, 14 insertions(+), 10 deletions(-) + +diff --git a/include/cudnn_frontend_ExecutionPlan.h b/include/cudnn_frontend_ExecutionPlan.h +index 7bed4b4..3314b5c 100644 +--- a/include/cudnn_frontend_ExecutionPlan.h ++++ b/include/cudnn_frontend_ExecutionPlan.h +@@ -167,6 +167,10 @@ class ExecutionPlan_v8 : public BackendDescriptor { + return json_string; + #endif + } ++ ++ ManagedOpaqueDescriptor GetEngineConfig() const { ++ return engine_config; ++ } + + ExecutionPlan_v8(ExecutionPlan_v8 const &) = default; + ExecutionPlan_v8 & +@@ -182,7 +186,7 @@ class ExecutionPlan_v8 : public BackendDescriptor { + CUDNN_TYPE_NUMERICAL_NOTE, + CUDNN_NUMERICAL_NOTE_TYPE_COUNT, + &elem_count, +- NULL); ++ nullptr); + numeric_notes_vec.resize(elem_count); + status = cudnnBackendGetAttribute(extractedEngine_, + CUDNN_ATTR_ENGINE_NUMERICAL_NOTE, +@@ -206,7 +210,7 @@ class ExecutionPlan_v8 : public BackendDescriptor { + CUDNN_TYPE_BEHAVIOR_NOTE, + CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, + &elem_count, +- NULL); ++ nullptr); + behavior_notes_vec.resize(elem_count); + status = cudnnBackendGetAttribute(extractedEngine_, + CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE, +@@ -310,7 +314,7 @@ class ExecutionPlan_v8 : public BackendDescriptor { + CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE, + CUDNN_TYPE_INT64, + 1, +- NULL, ++ nullptr, + &workSpaceSize); + if (status != CUDNN_STATUS_SUCCESS) { + set_error_and_throw_exception(this, +diff --git a/include/cudnn_frontend_ExecutionPlanCache.h b/include/cudnn_frontend_ExecutionPlanCache.h +index 99a157c..741c490 100644 +--- a/include/cudnn_frontend_ExecutionPlanCache.h ++++ b/include/cudnn_frontend_ExecutionPlanCache.h +@@ -94,7 +94,7 @@ class ExecutionPlanCache_v1 { + + /// String to map of feature_vector to execution plan + /// For a given FeatureVector of type T according to the Operation Graph, we get the plan. +- using FeatureVectorToPlanMap = std::map; ++ using FeatureVectorToPlanMap = std::map; + FeatureVectorToPlanMap cache; + + mutable std::mutex cache_mutex; +diff --git a/include/cudnn_frontend_OperationGraph.h b/include/cudnn_frontend_OperationGraph.h +index 1478ce8..7894080 100644 +--- a/include/cudnn_frontend_OperationGraph.h ++++ b/include/cudnn_frontend_OperationGraph.h +@@ -78,7 +78,7 @@ class OperationGraph_v8 : public BackendDescriptor { + CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT, + CUDNN_TYPE_INT64, + 1, +- NULL, ++ nullptr, + &global_count); + if (status != CUDNN_STATUS_SUCCESS) { + set_error_and_throw_exception(this, +diff --git a/include/cudnn_frontend_find_plan.h b/include/cudnn_frontend_find_plan.h +index 02a08a1..5f94e45 100644 +--- a/include/cudnn_frontend_find_plan.h ++++ b/include/cudnn_frontend_find_plan.h +@@ -53,7 +53,7 @@ time_sorted_plan(cudnnHandle_t handle, executionPlans_t plans, VariantPack const + cudaDeviceSynchronize(); + + cudaStream_t stream = nullptr; +- ::cudnnGetStream(handle, &stream); ++ cudnnGetStream(handle, &stream); + + for (auto &plan : plans) { + float time_ms = 0.0f; +@@ -61,7 +61,7 @@ time_sorted_plan(cudnnHandle_t handle, executionPlans_t plans, VariantPack const + float min_time_ms = std::numeric_limits::max(); + + // Warm-up run +- auto warmup_status = ::cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc()); ++ auto warmup_status = cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc()); + if (warmup_status != CUDNN_STATUS_SUCCESS) { + getLogger() << "[cudnn_frontend] Plan " << plan.getTag() << " failed with " << to_string(warmup_status) << std::endl; + continue; +@@ -71,7 +71,7 @@ time_sorted_plan(cudnnHandle_t handle, executionPlans_t plans, VariantPack const + for (int i = 0; i < maxIterCount; i++) { + cudaEventRecord(start, stream); + +- ::cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc()); ++ cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc()); + + cudaEventRecord(stop, stream); + cudaEventSynchronize(stop); +diff --git a/include/cudnn_frontend_get_plan.h b/include/cudnn_frontend_get_plan.h +index 50535ab..c43eec9 100644 +--- a/include/cudnn_frontend_get_plan.h ++++ b/include/cudnn_frontend_get_plan.h +@@ -26,7 +26,7 @@ + + namespace cudnn_frontend { + +-auto ++inline auto + EngineConfigGenerator::cudnnGetPlan(cudnnHandle_t handle, OperationGraph & opGraph) + -> executionPlans_t { + // Creating a set of execution plans that are supported. +@@ -47,7 +47,7 @@ EngineConfigGenerator::cudnnGetPlan(cudnnHandle_t handle, OperationGraph & opGra + return plans; + } + +-auto ++inline auto + EngineConfigGenerator::cudnnGetPlan(cudnnHandle_t handle, OperationGraph & opGraph, Predicate pred) + -> executionPlans_t { + // Creating a set of execution plans that are supported. +-- +2.25.1 + -- GitLab