未验证 提交 eb100c7b 编写于 作者: T Tian Zheng 提交者: GitHub

Add build option for CUDNN Frontend API (#47524)

* Add build option for CUDNN Frontend API

* Fix review comments

* Change namespace for cudnn_frontend.h
上级 d38010e8
......@@ -305,6 +305,8 @@ option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF)
option(WITH_ARM_BRPC "Supprot Brpc in Arm" OFF)
option(WITH_FLPS "FL PS mode" OFF)
option(WITH_RPC "Compile with rpc support" ${WITH_DISTRIBUTE})
option(WITH_CUDNN_FRONTEND
"Compile with CUDNN Frontend API support (experimental)" OFF)
if(WITH_RECORD_BUILDTIME)
set_property(
......
......@@ -248,3 +248,7 @@ endif()
if(WITH_GPU_GRAPH)
add_definitions(-DPADDLE_WITH_GPU_GRAPH)
endif()
# Make the WITH_CUDNN_FRONTEND build option visible to C/C++ sources as the
# PADDLE_WITH_CUDNN_FRONTEND preprocessor macro.
if(WITH_CUDNN_FRONTEND)
  add_definitions(-DPADDLE_WITH_CUDNN_FRONTEND)
endif()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Fetch the NVIDIA cudnn-frontend header-only library as an external project
# and expose it as the INTERFACE target `cudnn-frontend`.
# Requires WITH_GPU and a cuDNN at least as new as
# CUDNN_FRONTEND_CUDNN_MIN_VERSION.
include(ExternalProject)

set(CUDNN_FRONTEND_CUDNN_MIN_VERSION 8000)

if(NOT WITH_GPU)
  message(FATAL_ERROR "Can't enable CUDNN Frontend API without CUDA.")
endif()
# Compare against the named minimum-version variable instead of a duplicated
# literal so the check and the error message cannot drift apart.
if(CUDNN_VERSION LESS ${CUDNN_FRONTEND_CUDNN_MIN_VERSION})
  message(
    FATAL_ERROR
      "Minimum CUDNN version is ${CUDNN_FRONTEND_CUDNN_MIN_VERSION}. Current: ${CUDNN_VERSION}"
  )
endif()

# Version: v0.7.1
set(CUDNN_FRONTEND_PREFIX_DIR ${THIRD_PARTY_PATH}/cudnn-frontend)
set(CUDNN_FRONTEND_SOURCE_DIR
    ${THIRD_PARTY_PATH}/cudnn-frontend/src/extern_cudnn_frontend/include)
set(CUDNN_FRONTEND_REPOSITORY https://github.com/NVIDIA/cudnn-frontend.git)
set(CUDNN_FRONTEND_TAG v0.7.1)

# Header-only library: the include directory is the checked-out source tree.
set(CUDNN_FRONTEND_INCLUDE_DIR ${CUDNN_FRONTEND_SOURCE_DIR})
include_directories(${CUDNN_FRONTEND_INCLUDE_DIR})

message(
  STATUS
    "Adding cudnn-frontend. Version: ${CUDNN_FRONTEND_TAG}. Directory: ${CUDNN_FRONTEND_INCLUDE_DIR}"
)

# No configure/build/install steps are needed (header-only); the checkout is
# only cloned and then patched for Paddle.
ExternalProject_Add(
  extern_cudnn_frontend
  ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
  GIT_REPOSITORY ${CUDNN_FRONTEND_REPOSITORY}
  GIT_TAG ${CUDNN_FRONTEND_TAG}
  PREFIX ${CUDNN_FRONTEND_PREFIX_DIR}
  UPDATE_COMMAND ""
  PATCH_COMMAND
    patch -d ${CUDNN_FRONTEND_SOURCE_DIR} -p2 <
    ${PADDLE_SOURCE_DIR}/patches/cudnn-frontend/0001-patch-for-paddle.patch
  CONFIGURE_COMMAND ""
  BUILD_COMMAND ""
  INSTALL_COMMAND ""
  TEST_COMMAND "")

add_library(cudnn-frontend INTERFACE)
add_dependencies(cudnn-frontend extern_cudnn_frontend)
......@@ -162,6 +162,11 @@ if(NOT WIN32)
)
endif()
if(WITH_CUDNN_FRONTEND)
  # flags from https://github.com/NVIDIA/cudnn-frontend/blob/v0.7.1/CMakeLists.txt
  # NOTE(review): presumably these suppress warnings emitted from inside the
  # cudnn-frontend headers so they do not break warnings-as-errors builds —
  # confirm against the upstream CMakeLists referenced above.
  set(COMMON_FLAGS ${COMMON_FLAGS} -Wno-sign-compare -Wno-non-virtual-dtor)
endif()
if(WITH_ASCEND_CL AND WITH_ARM_BRPC)
set(COMMON_FLAGS ${COMMON_FLAGS} -faligned-new)
endif()
......
......@@ -515,4 +515,9 @@ if(WITH_GPU
endif()
endif()
if(WITH_CUDNN_FRONTEND)
  include(external/cudnn-frontend) # download cudnn-frontend
  # Register the checkout with the aggregate `third_party` target so the
  # headers are fetched before anything that consumes them is built.
  list(APPEND third_party_deps extern_cudnn_frontend)
endif()
add_custom_target(third_party ALL DEPENDS ${third_party_deps})
......@@ -1021,3 +1021,15 @@ PADDLE_DEFINE_EXPORTED_bool(
PADDLE_DEFINE_EXPORTED_string(jit_engine_type,
"Predictor",
"Choose default funciton type in JitLayer.");
#ifdef PADDLE_WITH_CUDNN_FRONTEND
/**
 * CUDNNv8 related FLAG
 * Name: enable_cudnn_frontend
 * Since Version: 2.5.0
 * Value Range: bool, default=false
 * Example: FLAGS_enable_cudnn_frontend=true opts kernels into the CUDNNv8
 *          Frontend API; it takes effect only when the loaded cuDNN reports
 *          version >= 8000 (see IsCudnnFrontendEnabled).
 * Note: Enable CUDNNv8 Frontend API for CUDNN kernels.
 */
PADDLE_DEFINE_EXPORTED_bool(enable_cudnn_frontend, false, "");
#endif  // PADDLE_WITH_CUDNN_FRONTEND
......@@ -99,3 +99,10 @@ if(MKL_FOUND AND WITH_ONEMKL)
DEPS phi_dynamic_loader)
target_include_directories(phi_dynload_mklrt PRIVATE ${MKL_INCLUDE})
endif()
if(WITH_CUDNN_FRONTEND)
  # Smoke test for the dynloaded cudnn-frontend API; depends on the lazily
  # loaded cuDNN wrappers and the header-only cudnn-frontend target.
  nv_test(
    cudnn_frontend_test
    SRCS cudnn_frontend_test.cc
    DEPS phi_dynload_cuda cudnn-frontend)
endif()
......@@ -46,6 +46,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP);
#endif
#ifdef CUDNN_DNN_ROUTINE_EACH_FRONTEND
// Instantiate the dynamic-load wrappers for the cuDNN Backend-API symbols
// required by the cudnn-frontend integration (macro list declared in the
// corresponding header).
CUDNN_DNN_ROUTINE_EACH_FRONTEND(DEFINE_WRAP);
#endif
bool HasCUDNN() {
std::call_once(cudnn_dso_flag,
[]() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
......
......@@ -194,6 +194,19 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#ifdef PADDLE_WITH_CUDNN_FRONTEND
// cuDNN Backend-API entry points needed by the cudnn-frontend header-only
// library. Each symbol is declared as a lazily-resolved wrapper (loaded from
// the cuDNN shared library at first use) rather than linked directly.
#define CUDNN_DNN_ROUTINE_EACH_FRONTEND(__macro) \
  __macro(cudnnBackendCreateDescriptor);         \
  __macro(cudnnBackendDestroyDescriptor);        \
  __macro(cudnnBackendExecute);                  \
  __macro(cudnnBackendFinalize);                 \
  __macro(cudnnBackendGetAttribute);             \
  __macro(cudnnBackendSetAttribute);             \
  __macro(cudnnGetStream);                       \
  __macro(cudnnReorderFilterAndBias);
CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
} // namespace dynload
} // namespace phi
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include "gflags/gflags.h"
#include "glog/logging.h"

#include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/backends/gpu/gpu_info.h"

// Declared in the exported-flags translation unit; gates the CUDNNv8
// frontend path at runtime.
DECLARE_bool(enable_cudnn_frontend);

// Redirect the CUDNN APIs in the cudnn_frontend namespace to
// the functions in phi::dynload
#define CUDNN_FRONTEND_OVERRIDE_SYMBOL(__name) using phi::dynload::__name

// Every cuDNN symbol the cudnn-frontend headers call unqualified.
#define CUDNN_FRONTEND_APPLY_EACH(__macro) \
  __macro(cudnnBackendCreateDescriptor);   \
  __macro(cudnnBackendDestroyDescriptor);  \
  __macro(cudnnBackendExecute);            \
  __macro(cudnnBackendFinalize);           \
  __macro(cudnnBackendGetAttribute);       \
  __macro(cudnnBackendSetAttribute);       \
  __macro(cudnnCreateFilterDescriptor);    \
  __macro(cudnnDestroyFilterDescriptor);   \
  __macro(cudnnGetStream);                 \
  __macro(cudnnGetVersion);                \
  __macro(cudnnReorderFilterAndBias);      \
  __macro(cudnnSetFilterNdDescriptor);

// Inject the using-declarations into namespace cudnn_frontend BEFORE the
// frontend headers are included, so name lookup inside those headers resolves
// to the phi::dynload wrappers instead of the direct libcudnn symbols.
namespace cudnn_frontend {
CUDNN_FRONTEND_APPLY_EACH(CUDNN_FRONTEND_OVERRIDE_SYMBOL);
}  // namespace cudnn_frontend

// NOTE: these includes must stay AFTER the namespace injection above; do not
// let a formatter or include-sorter move them.
// clang-format off
#include <cudnn_frontend.h>            // NOLINT
#include <cudnn_frontend_find_plan.h>  // NOLINT
#include <cudnn_frontend_get_plan.h>   // NOLINT
// clang-format on
namespace phi {
namespace dynload {
// Whether the CUDNNv8 frontend path should be taken: requires both the
// FLAGS_enable_cudnn_frontend runtime flag and a loaded cuDNN reporting
// version >= 8000.
inline bool IsCudnnFrontendEnabled() {
  const int version = phi::backends::gpu::DnnVersion();
  const bool flag_enabled = (version >= 8000) && FLAGS_enable_cudnn_frontend;
  VLOG(3) << "[cudnn_frontend] flag_enabled=" << flag_enabled;
  return flag_enabled;
}
} // namespace dynload
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <iostream>
#include "paddle/phi/backends/dynload/cudnn.h"
#include "paddle/phi/backends/dynload/cudnn_frontend.h"
// Smoke test: builds a tensor descriptor through
// cudnn_frontend::TensorBuilder and fails if a cudnnException is thrown.
TEST(CudnnFrontendTest, TensorCreation) {
  // Creation of a 4-d tensor: n,c,h,w as 4,32,32,32
  // (the original comment said "2d", but the dim/stride arrays are 4-d).
  std::cout << "Tensor creation comparison" << std::endl;
  std::array<int64_t, 4> tensor_dim = {4, 32, 32, 32};
  std::array<int64_t, 4> tensor_str = {32768, 1024, 32, 1};  // NCHW format
  cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
  int64_t alignment = sizeof(float);
  int64_t id = 0xD0D0CACA;  // Some magic number
  try {
    // `tensor` is intentionally unused beyond construction: the test only
    // verifies that build() completes without throwing.
    auto tensor = cudnn_frontend::TensorBuilder()
                      .setDim(tensor_dim.size(), tensor_dim.data())
                      .setStrides(tensor_str.size(), tensor_str.data())
                      .setId(id)
                      .setAlignment(alignment)
                      .setDataType(data_type)
                      .build();
  } catch (cudnn_frontend::cudnnException &e) {
    std::cout << "Exception in tensor creation " << e.what() << std::endl;
    FAIL();
  }
  std::cout << "Finished tensor creation." << std::endl;
}
From dce3465da518641ee177187fbc0c0d36faea28f2 Mon Sep 17 00:00:00 2001
From: Tian Zheng <tizheng@nvidia.com>
Date: Thu, 27 Oct 2022 20:33:16 -0700
Subject: [PATCH] patch for paddle
---
include/cudnn_frontend_ExecutionPlan.h | 10 +++++++---
include/cudnn_frontend_ExecutionPlanCache.h | 2 +-
include/cudnn_frontend_OperationGraph.h | 2 +-
include/cudnn_frontend_find_plan.h | 6 +++---
include/cudnn_frontend_get_plan.h | 4 ++--
5 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/include/cudnn_frontend_ExecutionPlan.h b/include/cudnn_frontend_ExecutionPlan.h
index 7bed4b4..3314b5c 100644
--- a/include/cudnn_frontend_ExecutionPlan.h
+++ b/include/cudnn_frontend_ExecutionPlan.h
@@ -167,6 +167,10 @@ class ExecutionPlan_v8 : public BackendDescriptor {
return json_string;
#endif
}
+
+ ManagedOpaqueDescriptor GetEngineConfig() const {
+ return engine_config;
+ }
ExecutionPlan_v8(ExecutionPlan_v8 const &) = default;
ExecutionPlan_v8 &
@@ -182,7 +186,7 @@ class ExecutionPlan_v8 : public BackendDescriptor {
CUDNN_TYPE_NUMERICAL_NOTE,
CUDNN_NUMERICAL_NOTE_TYPE_COUNT,
&elem_count,
- NULL);
+ nullptr);
numeric_notes_vec.resize(elem_count);
status = cudnnBackendGetAttribute(extractedEngine_,
CUDNN_ATTR_ENGINE_NUMERICAL_NOTE,
@@ -206,7 +210,7 @@ class ExecutionPlan_v8 : public BackendDescriptor {
CUDNN_TYPE_BEHAVIOR_NOTE,
CUDNN_BEHAVIOR_NOTE_TYPE_COUNT,
&elem_count,
- NULL);
+ nullptr);
behavior_notes_vec.resize(elem_count);
status = cudnnBackendGetAttribute(extractedEngine_,
CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE,
@@ -310,7 +314,7 @@ class ExecutionPlan_v8 : public BackendDescriptor {
CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE,
CUDNN_TYPE_INT64,
1,
- NULL,
+ nullptr,
&workSpaceSize);
if (status != CUDNN_STATUS_SUCCESS) {
set_error_and_throw_exception(this,
diff --git a/include/cudnn_frontend_ExecutionPlanCache.h b/include/cudnn_frontend_ExecutionPlanCache.h
index 99a157c..741c490 100644
--- a/include/cudnn_frontend_ExecutionPlanCache.h
+++ b/include/cudnn_frontend_ExecutionPlanCache.h
@@ -94,7 +94,7 @@ class ExecutionPlanCache_v1 {
/// String to map of feature_vector to execution plan
/// For a given FeatureVector of type T according to the Operation Graph, we get the plan.
- using FeatureVectorToPlanMap = std::map<cudnn_frontend::feature_vector_t, cudnn_frontend::ExecutionPlan, cudnn_frontend::ExecutionPlanCache_v1::compare>;
+ using FeatureVectorToPlanMap = std::map<cudnn_frontend::feature_vector_t, cudnn_frontend::ExecutionPlan>;
FeatureVectorToPlanMap cache;
mutable std::mutex cache_mutex;
diff --git a/include/cudnn_frontend_OperationGraph.h b/include/cudnn_frontend_OperationGraph.h
index 1478ce8..7894080 100644
--- a/include/cudnn_frontend_OperationGraph.h
+++ b/include/cudnn_frontend_OperationGraph.h
@@ -78,7 +78,7 @@ class OperationGraph_v8 : public BackendDescriptor {
CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT,
CUDNN_TYPE_INT64,
1,
- NULL,
+ nullptr,
&global_count);
if (status != CUDNN_STATUS_SUCCESS) {
set_error_and_throw_exception(this,
diff --git a/include/cudnn_frontend_find_plan.h b/include/cudnn_frontend_find_plan.h
index 02a08a1..5f94e45 100644
--- a/include/cudnn_frontend_find_plan.h
+++ b/include/cudnn_frontend_find_plan.h
@@ -53,7 +53,7 @@ time_sorted_plan(cudnnHandle_t handle, executionPlans_t plans, VariantPack const
cudaDeviceSynchronize();
cudaStream_t stream = nullptr;
- ::cudnnGetStream(handle, &stream);
+ cudnnGetStream(handle, &stream);
for (auto &plan : plans) {
float time_ms = 0.0f;
@@ -61,7 +61,7 @@ time_sorted_plan(cudnnHandle_t handle, executionPlans_t plans, VariantPack const
float min_time_ms = std::numeric_limits<float>::max();
// Warm-up run
- auto warmup_status = ::cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc());
+ auto warmup_status = cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc());
if (warmup_status != CUDNN_STATUS_SUCCESS) {
getLogger() << "[cudnn_frontend] Plan " << plan.getTag() << " failed with " << to_string(warmup_status) << std::endl;
continue;
@@ -71,7 +71,7 @@ time_sorted_plan(cudnnHandle_t handle, executionPlans_t plans, VariantPack const
for (int i = 0; i < maxIterCount; i++) {
cudaEventRecord(start, stream);
- ::cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc());
+ cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc());
cudaEventRecord(stop, stream);
cudaEventSynchronize(stop);
diff --git a/include/cudnn_frontend_get_plan.h b/include/cudnn_frontend_get_plan.h
index 50535ab..c43eec9 100644
--- a/include/cudnn_frontend_get_plan.h
+++ b/include/cudnn_frontend_get_plan.h
@@ -26,7 +26,7 @@
namespace cudnn_frontend {
-auto
+inline auto
EngineConfigGenerator::cudnnGetPlan(cudnnHandle_t handle, OperationGraph & opGraph)
-> executionPlans_t {
// Creating a set of execution plans that are supported.
@@ -47,7 +47,7 @@ EngineConfigGenerator::cudnnGetPlan(cudnnHandle_t handle, OperationGraph & opGra
return plans;
}
-auto
+inline auto
EngineConfigGenerator::cudnnGetPlan(cudnnHandle_t handle, OperationGraph & opGraph, Predicate pred)
-> executionPlans_t {
// Creating a set of execution plans that are supported.
--
2.25.1
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册