Commit 76b77d81 (unverified), authored by Tian Zheng, committed by GitHub

Update CUDNN Frontend API to v0.9.1 (#54949)

* Update CUDNN Frontend API to v0.9.1
- Remove old patches
- Remove workarounds that are no longer needed

* Fix test_switch_autotune
Parent f1bffdac
@@ -28,24 +28,24 @@ endif()
 if((NOT DEFINED CUDNN_FRONTEND_NAME) OR (NOT DEFINED CUDNN_FRONTEND_URL))
   set(CUDNN_FRONTEND_VER
-      "1.23.2"
+      "v0.9.1"
       CACHE STRING "" FORCE)
   set(CUDNN_FRONTEND_NAME
       "cudnn-frontend"
       CACHE STRING "" FORCE)
   set(CUDNN_FRONTEND_URL
-      "https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v0.7.1.tar.gz"
+      "https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/${CUDNN_FRONTEND_VER}.tar.gz"
       CACHE STRING "" FORCE)
-  set(CUDNN_FRONTEND_CACHE_FILENAME "v0.7.1.tar.gz")
 endif()
-set(CUDNN_FRONTEND_URL_MD5 "d8f911df571f8b0d40226efa9c0150c8")
+set(CUDNN_FRONTEND_CACHE_FILENAME "${CUDNN_FRONTEND_VER}.tar.gz")
+set(CUDNN_FRONTEND_URL_MD5 "da7cbad1305427f687dd4fd737178f80")
 message(
   STATUS
     "CUDNN_FRONTEND_NAME: ${CUDNN_FRONTEND_NAME}, CUDNN_FRONTEND_URL: ${CUDNN_FRONTEND_URL}"
 )
-set(DIRENT_DOWNLOAD_DIR "${PADDLE_SOURCE_DIR}/third_party/cudnn-frontend")
-# Version: v0.7.1
+set(CUDNN_FRONTEND_DOWNLOAD_DIR
+    "${PADDLE_SOURCE_DIR}/third_party/cudnn-frontend")
 set(CUDNN_FRONTEND_PREFIX_DIR ${THIRD_PARTY_PATH}/cudnn-frontend)
 set(CUDNN_FRONTEND_SOURCE_DIR
     ${THIRD_PARTY_PATH}/cudnn-frontend/src/extern_cudnn_frontend/include)
@@ -55,7 +55,7 @@ include_directories(${CUDNN_FRONTEND_INCLUDE_DIR})
 message(
   STATUS
-    "Adding cudnn-frontend. Version: ${CUDNN_FRONTEND_VER}. Directory: ${DIRENT_DOWNLOAD_DIR}"
+    "Adding cudnn-frontend. Version: ${CUDNN_FRONTEND_VER}. Directory: ${CUDNN_FRONTEND_DOWNLOAD_DIR}"
 )
 function(download_cudnn_frontend)
@@ -99,9 +99,7 @@ ExternalProject_Add(
   DOWNLOAD_DIR ${CUDNN_FRONTEND_DOWNLOAD_DIR}
   DOWNLOAD_NO_PROGRESS 1
   UPDATE_COMMAND ""
-  PATCH_COMMAND
-    patch -d ${CUDNN_FRONTEND_SOURCE_DIR} -p2 <
-    ${PADDLE_SOURCE_DIR}/patches/cudnn-frontend/0001-patch-for-paddle.patch
+  PATCH_COMMAND ""
   CONFIGURE_COMMAND ""
   BUILD_COMMAND ""
   INSTALL_COMMAND ""
......
@@ -17,6 +17,7 @@
 #include <map>
 #include <mutex>
 #include <string>
+#include <thread>
 #include <vector>
 #include "paddle/phi/backends/dynload/cudnn_frontend.h"
@@ -34,7 +35,13 @@ class CudnnFrontendPlanCache {
     saturation_count_ = FLAGS_cudnn_cache_saturation_count;
   }
-  int64_t Size() const { return map_.size(); }
+  int64_t Size() const {
+    int64_t total_size = 0;
+    for (auto it = map_.begin(); it != map_.end(); it++) {
+      total_size += (it->second).size();
+    }
+    return total_size;
+  }
   int64_t CacheHits() const { return cache_hits_; }
@@ -58,11 +65,12 @@ class CudnnFrontendPlanCache {
     cache_misses_ = 0;
   }
-  bool FindPlan(const cudnn_frontend::OperationGraph& op_graph,
-                bool use_addto = false) {
+  bool FindPlan(const cudnn_frontend::feature_vector_t &feature,
+                cudnnHandle_t handle) {
     bool ret = false;
     std::lock_guard<std::mutex> lock(*cache_mutex_);
-    if (map_.count(MakeKey(op_graph, use_addto)) > 0) {
+    auto &local_map = map_[hasher(std::this_thread::get_id())];
+    if (local_map.count(GetExtendedFeature(feature, handle)) > 0) {
       cache_hits_++;
       ret = true;
     } else {
@@ -71,58 +79,98 @@ class CudnnFrontendPlanCache {
     return ret;
   }
-  cudnn_frontend::ManagedOpaqueDescriptor GetConfig(
-      const cudnn_frontend::OperationGraph& op_graph,
-      cudnnHandle_t handle,
-      bool use_addto = false) {
+  void GetPlan(const cudnn_frontend::feature_vector_t &feature,
+               const cudnn_frontend::ExecutionPlan **plan,
+               int64_t *workspace_size,
+               cudnnHandle_t handle) {
+    // Note(tizheng): CUDNNv8 execution plan is not thread-safe.
+    // A shared plan being executed by different threads is
+    // generally not safe (for now).
     std::lock_guard<std::mutex> lock(*cache_mutex_);
-    auto engine_config = map_[MakeKey(op_graph, use_addto)];
-    return engine_config;
+    auto &local_map = map_[hasher(std::this_thread::get_id())];
+
+    auto it = local_map.find(GetExtendedFeature(feature, handle));
+    if (it == local_map.end()) {
+      PADDLE_THROW(phi::errors::InvalidArgument(
+          "[cudnn_frontend] Cached Plan Not Found."));
+      return;
+    }
+    *plan = &(it->second);
+    *workspace_size = (*plan)->getWorkspaceSize();
+    VLOG(4) << "Cached execution plan found." << (*plan)->getTag()
+            << "; Require workspace: " << *workspace_size;
   }
-  void InsertPlan(const cudnn_frontend::OperationGraph& op_graph,
-                  const cudnn_frontend::ExecutionPlan& plan,
-                  bool use_addto = false) {
-    VLOG(4) << "[cudnn_frontend] cache: Insert graph tag: "
-            << op_graph.getTag();
+  void InsertPlan(const cudnn_frontend::feature_vector_t &feature,
+                  const cudnn_frontend::ExecutionPlan &plan,
+                  cudnnHandle_t handle) {
+    VLOG(4) << "[cudnn_frontend] cache: Insert plan: " << plan.getTag();
     std::lock_guard<std::mutex> lock(*cache_mutex_);
-    map_.insert(
-        std::make_pair(MakeKey(op_graph, use_addto), plan.GetEngineConfig()));
+    auto &local_map = map_[hasher(std::this_thread::get_id())];
+    local_map.insert(std::make_pair(GetExtendedFeature(feature, handle), plan));
   }
-  bool IsStable(const cudnn_frontend::OperationGraph& op_graph,
-                const std::string& tag,
-                bool use_addto = false) {
+  bool IsStable(const cudnn_frontend::feature_vector_t &feature,
+                const std::string &tag,
+                cudnnHandle_t handle) {
     if (saturation_count_ == 1) {
       return true;
     }
     std::lock_guard<std::mutex> lock(*cache_mutex_);
-    if (map_.count(MakeKey(op_graph, use_addto))) {
+    auto &local_map = map_[hasher(std::this_thread::get_id())];
+    auto &local_tracker = tracker_[hasher(std::this_thread::get_id())];
+    auto ext_feature = GetExtendedFeature(feature, handle);
+    if (local_map.count(ext_feature)) {
       return false;
     }
-    int cnt = tracker_[std::make_pair(MakeKey(op_graph, use_addto), tag)] += 1;
-    VLOG(4) << "[cudnn_frontend] SaturationTracker: " << op_graph.getTag()
-            << " " << tag << " " << cnt;
+    int cnt = local_tracker[std::make_pair(ext_feature, tag)] += 1;
+    VLOG(4) << "[cudnn_frontend] SaturationTracker: " << tag << " " << cnt;
     return cnt >= saturation_count_;
   }
+
+  bool FindPlan(const cudnn_frontend::OperationGraph &op_graph,
+                cudnnHandle_t handle) {
+    return FindPlan(op_graph.getFeatureVector(), handle);
+  }
+
+  void GetPlan(const cudnn_frontend::OperationGraph &op_graph,
+               const cudnn_frontend::ExecutionPlan **plan,
+               int64_t *workspace_size,
+               cudnnHandle_t handle) {
+    GetPlan(op_graph.getFeatureVector(), plan, workspace_size, handle);
+  }
+
+  void InsertPlan(const cudnn_frontend::OperationGraph &op_graph,
+                  const cudnn_frontend::ExecutionPlan &plan,
+                  cudnnHandle_t handle) {
+    InsertPlan(op_graph.getFeatureVector(), plan, handle);
+  }
+
+  bool IsStable(const cudnn_frontend::OperationGraph &op_graph,
+                const std::string &tag,
+                cudnnHandle_t handle) {
+    return IsStable(op_graph.getFeatureVector(), tag, handle);
+  }
+
  private:
-  static cudnn_frontend::feature_vector_t MakeKey(
-      const cudnn_frontend::OperationGraph& op_graph, bool use_addto) {
-    auto key = op_graph.getFeatureVector();
-    key.push_back(static_cast<uint64_t>(use_addto));
-    return key;
+  cudnn_frontend::feature_vector_t GetExtendedFeature(
+      cudnn_frontend::feature_vector_t feat, cudnnHandle_t handle) {
+    int64_t val = 0;
+    memcpy(&val, &handle, sizeof(int64_t));
+    feat.push_back(val);
+    return feat;
   }
-  std::map<cudnn_frontend::feature_vector_t,
-           cudnn_frontend::ManagedOpaqueDescriptor>
-      map_;
+  using FeatureVectorToPlanMap =
+      std::map<cudnn_frontend::feature_vector_t, cudnn_frontend::ExecutionPlan>;
+  std::map<std::size_t, FeatureVectorToPlanMap> map_;
+  std::hash<std::thread::id> hasher;
   std::shared_ptr<std::mutex> cache_mutex_;
   int saturation_count_;
   using SaturationTracker =
       std::map<std::pair<cudnn_frontend::feature_vector_t, std::string>, int>;
-  SaturationTracker tracker_;
+  std::map<std::size_t, SaturationTracker> tracker_;
   int64_t cache_hits_{0};
   int64_t cache_misses_{0};
......
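The new cache design above deserves a note: cuDNN v8 execution plans are not safe to execute concurrently, so the cache keeps one inner map per thread (keyed by the hash of `std::this_thread::get_id()`), and every lookup key appends the raw `cudnnHandle_t` pointer value to the operation graph's feature vector, so a plan built for one handle is never replayed on another. The saturation tracker is sharded the same way. A minimal self-contained sketch of this two-level keying (the names here are illustrative, not Paddle's):

```cpp
#include <cstdint>
#include <cstring>
#include <functional>
#include <map>
#include <thread>
#include <vector>

// Stand-in for cudnn_frontend::feature_vector_t (a vector of int64_t).
using FeatureVector = std::vector<int64_t>;

struct ShardedPlanCache {
  // One inner map per thread: plans are never shared across threads.
  std::map<std::size_t, std::map<FeatureVector, int /*plan stand-in*/>> shards;
  std::hash<std::thread::id> hasher;

  // Append the handle's pointer value to the key, as GetExtendedFeature does
  // above (assumes 64-bit pointers, matching the memcpy into int64_t in the diff).
  static FeatureVector ExtendWithHandle(FeatureVector key, void* handle) {
    int64_t val = 0;
    std::memcpy(&val, &handle, sizeof(val));
    key.push_back(val);
    return key;
  }

  bool Contains(const FeatureVector& feature, void* handle) {
    auto& local = shards[hasher(std::this_thread::get_id())];
    return local.count(ExtendWithHandle(feature, handle)) > 0;
  }
};
```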
@@ -26,6 +26,7 @@ limitations under the License. */
 #include "paddle/phi/core/utils/data_type.h"
 #include "paddle/phi/kernels/autotune/cache.h"
 #include "paddle/phi/kernels/autotune/switch_autotune.h"
+#include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"

 namespace phi {

@@ -102,6 +103,33 @@ class CudnnFrontendConvHelper {
         .build();
   }
static inline cudnn_frontend::Tensor GetGeneralTensorDescriptor(
std::vector<int64_t> dims,
cudnnTensorFormat_t layout,
int64_t id,
int64_t alignment,
cudnnDataType_t dtype,
bool is_virtual = false,
int64_t group_count = 0) {
std::vector<int64_t> strides = GenerateStrides(dims, layout);
if (group_count > 0) {
int64_t c_per_group = dims[1];
int64_t c_stride = strides[1];
dims.insert(dims.begin() + 1, group_count);
strides.insert(strides.begin() + 1, c_stride * c_per_group);
}
cudnn_frontend::TensorBuilder builder;
builder.setDim(dims.size(), dims.data())
.setStride(strides.size(), strides.data())
.setId(id)
.setAlignment(alignment)
.setDataType(dtype);
if (is_virtual) {
builder.setVirtual();
}
return builder.build();
}
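The `group_count` branch above encodes grouped convolution in the tensor layout itself: the channel dimension (read here as channels-per-group) gains a leading group dimension whose stride spans one whole channel group. A tiny standalone sketch with hypothetical names:

```cpp
#include <cstdint>
#include <vector>

// Mirrors the dims/strides edit in GetGeneralTensorDescriptor: insert a group
// dimension at position 1 whose stride covers c_per_group channels.
void InsertGroupDim(std::vector<int64_t>* dims,
                    std::vector<int64_t>* strides,
                    int64_t group_count) {
  if (group_count > 0) {
    int64_t c_per_group = (*dims)[1];  // dims[1] is channels per group here
    int64_t c_stride = (*strides)[1];  // stride of the channel dimension
    dims->insert(dims->begin() + 1, group_count);
    strides->insert(strides->begin() + 1, c_stride * c_per_group);
  }
}

// e.g. dims {N, C/g, H, W} = {8, 16, 32, 32} with group_count = 4 becomes
// {8, 4, 16, 32, 32}; the new group dimension strides over one 16-channel group.
```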
   static cudnn_frontend::ConvDesc_v8 GetConvDescriptor(
       cudnnDataType_t dataType,
       const std::vector<int>& padding,
@@ -157,44 +185,26 @@ class CudnnFrontendConvHelper {
       cudnn_frontend::OperationGraph* op_graph_pointer,
       bool exhaustive_search,
       bool deterministic,
-      void* x_data,
-      void* y_data,
-      void* w_data,
+      std::vector<void*>* data_ptrs,
+      std::vector<int64_t>* uids,
       cudnnHandle_t handle,
       phi::DnnWorkspaceHandle* workspace_handle) {
     auto heurgen_method = [=](cudnn_frontend::OperationGraph& op_graph_)
         -> cudnn_frontend::EngineConfigList {
-      auto heuristics = cudnn_frontend::EngineHeuristicsBuilder()
-                            .setOperationGraph(op_graph_)
-                            .setHeurMode(CUDNN_HEUR_MODE_INSTANT)
-                            .build();
-      VLOG(4) << "Heuristic has " << heuristics.getEngineConfigCount()
-              << " configurations ";
-      auto& engine_configs =
-          heuristics.getEngineConfig(heuristics.getEngineConfigCount());
-      cudnn_frontend::EngineConfigList filtered_configs;
-      cudnn_frontend::filter(engine_configs,
-                             filtered_configs,
-                             deterministic ? IsNonDeterministic : AllowAll);
-      return filtered_configs;
-    };
-    auto fallback_method = [=](cudnn_frontend::OperationGraph& op_graph_)
-        -> cudnn_frontend::EngineConfigList {
-      auto fallback = cudnn_frontend::EngineFallbackListBuilder()
-                          .setOperationGraph(op_graph_)
-                          .build();
-      auto& fallback_list = fallback.getFallbackList();
-      cudnn_frontend::EngineConfigList filtered_configs;
-      cudnn_frontend::filter(fallback_list,
-                             filtered_configs,
-                             deterministic ? IsNonDeterministic : AllowAll);
+      cudnn_frontend::EngineConfigList filtered_configs;
+      auto statuses = cudnn_frontend::get_heuristics_list<2>(
+          {"heuristics_instant", "heuristics_fallback"},
+          op_graph_,
+          deterministic ? IsNonDeterministic : AllowAll,
+          filtered_configs,
+          true);
+      VLOG(6) << "Filter config list has " << filtered_configs.size()
+              << " configurations ";
       return filtered_configs;
     };
-    std::array<cudnn_frontend::GeneratorSource const, 2> sources = {
-        heurgen_method, fallback_method};
+    std::array<cudnn_frontend::GeneratorSource const, 1> sources = {
+        heurgen_method};
     cudnn_frontend::EngineConfigGenerator generator(sources.size(),
                                                     sources.data());
@@ -204,29 +214,18 @@ class CudnnFrontendConvHelper {
         [=](cudnn_frontend::ExecutionPlan const& plan) -> bool {
           return plan.getWorkspaceSize() > workspace_size_limit;
         };
-    auto plans =
-        generator.cudnnGetPlan(handle, *op_graph_pointer, predicate_function);
+    VLOG(6) << "[cudnn_frontend] Max workspace size: " << workspace_size_limit;
+    cudnn_frontend::executionPlans_t plans;
     bool use_autotune = phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
     if (!deterministic && (exhaustive_search || use_autotune)) {
-      size_t workspace_size_max = 0;
-      std::for_each(
-          plans.begin(), plans.end(), [&](cudnn_frontend::ExecutionPlan& opt) {
-            if (opt.getWorkspaceSize() > workspace_size_max) {
-              workspace_size_max = opt.getWorkspaceSize();
-            }
-          });
-      VLOG(6) << "[cudnn_frontend] Max workspace size: " << workspace_size_max;
       workspace_handle->RunFunc(
           [&](void* workspace_ptr) {
-            void* data_ptrs[] = {x_data, y_data, w_data};
-            int64_t uids[] = {'x', 'y', 'w'};
-            auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                    .setWorkspacePointer(workspace_ptr)
-                                    .setDataPointers(3, data_ptrs)
-                                    .setUids(3, uids)
-                                    .build();
+            auto variant_pack =
+                cudnn_frontend::VariantPackBuilder()
+                    .setWorkspacePointer(workspace_ptr)
+                    .setDataPointers(data_ptrs->size(), data_ptrs->data())
+                    .setUids(uids->size(), uids->data())
+                    .build();
             plans =
                 generator
@@ -237,7 +236,10 @@ class CudnnFrontendConvHelper {
                         variant_pack,
                         predicate_function);
           },
-          workspace_size_max);
+          workspace_size_limit);
+    } else {
+      plans =
+          generator.cudnnGetPlan(handle, *op_graph_pointer, predicate_function);
     }
     std::for_each(
@@ -249,6 +251,146 @@ class CudnnFrontendConvHelper {
     return plans;
   }
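FindExecutionPlans now delegates engine-config generation to a single heuristics query: `cudnn_frontend::get_heuristics_list<2>` fetches the instant heuristics and the fallback list in one call, replacing the hand-rolled EngineHeuristicsBuilder/EngineFallbackListBuilder pair, and plans are only timed (the `cudnnFindPlan` path) when autotuning or exhaustive search is on. A condensed sketch of the heuristics query, assuming the v0.9.x signature used in the call above:

```cpp
#include <cudnn_frontend.h>

// Keep-all predicate: cudnn_frontend filters drop a config when the
// predicate returns true, so returning false keeps every engine config.
inline bool KeepAllConfigs(cudnnBackendDescriptor_t /*engine_config*/) {
  return false;
}

inline cudnn_frontend::EngineConfigList GetFilteredConfigs(
    cudnn_frontend::OperationGraph& op_graph) {
  cudnn_frontend::EngineConfigList filtered_configs;
  auto statuses = cudnn_frontend::get_heuristics_list<2>(
      {"heuristics_instant", "heuristics_fallback"},  // queried in order
      op_graph,
      KeepAllConfigs,  // swap in a deterministic-only predicate if needed
      filtered_configs,
      true);           // evaluate both sources rather than stopping at one
  (void)statuses;      // per-source status codes; ignored in this sketch
  return filtered_configs;
}
```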
static cudnn_frontend::executionPlans_t FindExecutionPlans(
cudnn_frontend::OperationGraph* op_graph_pointer,
bool exhaustive_search,
bool deterministic,
void* x_data,
void* y_data,
void* w_data,
cudnnHandle_t handle,
phi::DnnWorkspaceHandle* workspace_handle) {
std::vector<void*> data_ptrs({x_data, y_data, w_data});
std::vector<int64_t> uids({'x', 'y', 'w'});
return FindExecutionPlans(op_graph_pointer,
exhaustive_search,
deterministic,
&data_ptrs,
&uids,
handle,
workspace_handle);
}
static void ExecutePlan(cudnnHandle_t handle_,
phi::DnnWorkspaceHandle* workspace_handle,
std::vector<void*>* data_ptrs,
std::vector<int64_t>* uids,
cudnnBackendDescriptor_t plan_desc,
int64_t workspace_size) {
workspace_handle->RunFunc(
[&](void* workspace_ptr) {
auto variant_pack =
cudnn_frontend::VariantPackBuilder()
.setWorkspacePointer(workspace_ptr)
.setDataPointers(data_ptrs->size(), data_ptrs->data())
.setUids(uids->size(), uids->data())
.build();
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
handle_, plan_desc, variant_pack.get_raw_desc()));
},
workspace_size);
}
static void ExecutePlan(cudnnHandle_t handle_,
phi::DnnWorkspaceHandle* workspace_handle,
void* x_data,
void* y_data,
void* w_data,
cudnnBackendDescriptor_t plan_desc,
int64_t workspace_size) {
std::vector<void*> data_ptrs({x_data, y_data, w_data});
std::vector<int64_t> uids({'x', 'y', 'w'});
ExecutePlan(handle_,
workspace_handle,
&data_ptrs,
&uids,
plan_desc,
workspace_size);
}
static void ExecutePlansAndCache(
cudnnHandle_t handle_,
phi::DnnWorkspaceHandle* workspace_handle,
std::vector<void*>* data_ptrs,
std::vector<int64_t>* uids,
cudnn_frontend::executionPlans_t* plans,
bool exhaustive_search,
const cudnn_frontend::feature_vector_t& feature_vector,
phi::autotune::CudnnFrontendPlanCache* plan_cache) {
for (auto& plan : *plans) {
try {
ExecutePlan(handle_,
workspace_handle,
data_ptrs,
uids,
plan.get_raw_desc(),
plan.getWorkspaceSize());
if (!exhaustive_search ||
plan_cache->IsStable(feature_vector, plan.getTag(), handle_)) {
plan_cache->InsertPlan(feature_vector, plan, handle_);
}
return;
} catch (cudnn_frontend::cudnnException& e) {
VLOG(4) << "Plan " << plan.describe()
<< "failed to execute. Trying next plan.";
} catch (phi::enforce::EnforceNotMet& e) {
VLOG(4) << "Plan " << plan.describe()
<< "failed to execute. Trying next plan.";
}
}
PADDLE_THROW(phi::errors::InvalidArgument(
"[CUDNN Frontend API] No valid plan could "
"be found to execute. Try setting FLAGS_conv_workspace_size_limit "
"higher."));
}
static void ExecutePlansAndCache(
cudnnHandle_t handle_,
phi::DnnWorkspaceHandle* workspace_handle,
void* x_data,
void* y_data,
void* w_data,
cudnn_frontend::executionPlans_t* plans,
bool exhaustive_search,
const cudnn_frontend::OperationGraph& op_graph,
phi::autotune::CudnnFrontendPlanCache* plan_cache) {
std::vector<void*> data_ptrs({x_data, y_data, w_data});
std::vector<int64_t> uids({'x', 'y', 'w'});
ExecutePlansAndCache(handle_,
workspace_handle,
&data_ptrs,
&uids,
plans,
exhaustive_search,
op_graph.getFeatureVector(),
plan_cache);
}
static cudnn_frontend::Operation MakePointwiseOp(
cudnnPointwiseMode_t mode,
cudnnDataType_t dtype,
cudnn_frontend::Tensor const& x_desc,
cudnn_frontend::Tensor const& b_desc,
cudnn_frontend::Tensor const& y_desc,
float alpha1 = 1.0,
float alpha2 = 1.0) {
auto op_desc = cudnn_frontend::PointWiseDescBuilder()
.setMode(mode)
.setComputeType(dtype)
.build();
auto op = cudnn_frontend::OperationBuilder(
CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(x_desc)
.setbDesc(b_desc)
.setyDesc(y_desc)
.setpwDesc(op_desc)
.setAlpha(alpha1)
.setAlpha2(alpha2)
.build();
VLOG(6) << op.describe();
return op;
}
 };  // class CudnnFrontendConvHelper
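With these helpers in place, all three convolution kernels below reduce to the same cache-then-search protocol: probe the per-thread plan cache, replay the cached plan on a hit, otherwise generate candidate plans and let ExecutePlansAndCache run the first one that succeeds, inserting it into the cache (subject to the saturation check). A minimal sketch of that flow; `RunConvLikeOp`, its parameter list, the include path, and the `helper` alias are illustrative assumptions, not code from this diff:

```cpp
#include "paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h"  // path assumed

using helper = phi::CudnnFrontendConvHelper;  // alias assumed by the kernels

void RunConvLikeOp(cudnnHandle_t handle,
                   phi::DnnWorkspaceHandle* workspace_handle,
                   cudnn_frontend::OperationGraph* op_graph,
                   void* x_data, void* y_data, void* w_data,
                   bool exhaustive_search,
                   phi::autotune::CudnnFrontendPlanCache* plan_cache) {
  if (plan_cache->FindPlan(*op_graph, handle)) {
    // Cache hit: replay the plan this thread built earlier on this handle.
    const cudnn_frontend::ExecutionPlan* plan = nullptr;
    int64_t workspace_size = 0;
    plan_cache->GetPlan(*op_graph, &plan, &workspace_size, handle);
    helper::ExecutePlan(handle, workspace_handle, x_data, y_data, w_data,
                        plan->get_raw_desc(), workspace_size);
    return;
  }
  // Cache miss: generate candidates (timed when autotune/exhaustive search
  // is on), then run the first plan that works and cache it.
  auto plans = helper::FindExecutionPlans(op_graph,
                                          exhaustive_search,
                                          /*deterministic=*/false,
                                          x_data, y_data, w_data,
                                          handle, workspace_handle);
  helper::ExecutePlansAndCache(handle, workspace_handle,
                               x_data, y_data, w_data, &plans,
                               exhaustive_search, *op_graph, plan_cache);
}
```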
 template <typename T>
@@ -290,28 +432,17 @@ void CudnnConvBwdDataV8(const DenseTensor* dy_tensor,
                         alpha,
                         beta);
-  if (plan_cache_bwd_data.FindPlan(op_graph, use_addto)) {
-    auto engine_config =
-        plan_cache_bwd_data.GetConfig(op_graph, handle, use_addto);
-    auto cached_plan = cudnn_frontend::ExecutionPlanBuilder()
-                           .setHandle(handle)
-                           .setEngineConfig(engine_config, op_graph.getTag())
-                           .build();
-    auto workspace_size = cached_plan.getWorkspaceSize();
-    VLOG(4) << "Cached execution plan found." << cached_plan.getTag()
-            << "; Require workspace: " << workspace_size;
-    workspace_handle->RunFunc(
-        [&](void* workspace_ptr) {
-          void* data_ptrs[] = {dx_tensor_data, dy_tensor_data, w_tensor_data};
-          int64_t uids[] = {'x', 'y', 'w'};
-          auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                  .setWorkspacePointer(workspace_ptr)
-                                  .setDataPointers(3, data_ptrs)
-                                  .setUids(3, uids)
-                                  .build();
-          PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
-              handle, cached_plan.get_raw_desc(), variant_pack.get_raw_desc()));
-        },
-        workspace_size);
+  if (plan_cache_bwd_data.FindPlan(op_graph, handle)) {
+    const cudnn_frontend::ExecutionPlan* cached_plan = nullptr;
+    int64_t workspace_size = 0;
+    plan_cache_bwd_data.GetPlan(
+        op_graph, &cached_plan, &workspace_size, handle);
+    helper::ExecutePlan(handle,
+                        workspace_handle,
+                        dx_tensor_data,
+                        dy_tensor_data,
+                        w_tensor_data,
+                        cached_plan->get_raw_desc(),
+                        workspace_size);
     return;
   }
@@ -325,34 +456,15 @@ void CudnnConvBwdDataV8(const DenseTensor* dy_tensor,
       handle,
       workspace_handle);
-  for (auto& plan : plans) {
-    try {
-      int64_t workspace_size = plan.getWorkspaceSize();
-      workspace_handle->RunFunc(
-          [&](void* workspace_ptr) {
-            void* data_ptrs[] = {dx_tensor_data, dy_tensor_data, w_tensor_data};
-            int64_t uids[] = {'x', 'y', 'w'};
-            auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                    .setWorkspacePointer(workspace_ptr)
-                                    .setDataPointers(3, data_ptrs)
-                                    .setUids(3, uids)
-                                    .build();
-            PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
-                handle, plan.get_raw_desc(), variant_pack.get_raw_desc()));
-          },
-          workspace_size);
-      if (!exhaustive_search ||
-          plan_cache_bwd_data.IsStable(op_graph, plan.getTag(), use_addto)) {
-        plan_cache_bwd_data.InsertPlan(op_graph, plan, use_addto);
-      }
-      return;
-    } catch (cudnn_frontend::cudnnException& e) {
-    } catch (phi::enforce::EnforceNotMet& e) {
-    }
-  }
-  PADDLE_THROW(
-      phi::errors::InvalidArgument("[CUDNN Frontend API] No valid plan could "
-                                   "be found to execute conv backward data."));
+  helper::ExecutePlansAndCache(handle,
+                               workspace_handle,
+                               dx_tensor_data,
+                               dy_tensor_data,
+                               w_tensor_data,
+                               &plans,
+                               exhaustive_search,
+                               op_graph,
+                               &plan_cache_bwd_data);
 }
 template <typename T>
@@ -394,27 +506,17 @@ void CudnnConvBwdFilterV8(const DenseTensor* x_tensor,
                         alpha,
                         beta);
-  if (plan_cache_bwd_filter.FindPlan(op_graph)) {
-    auto engine_config = plan_cache_bwd_filter.GetConfig(op_graph, handle);
-    auto cached_plan = cudnn_frontend::ExecutionPlanBuilder()
-                           .setHandle(handle)
-                           .setEngineConfig(engine_config, op_graph.getTag())
-                           .build();
-    auto workspace_size = cached_plan.getWorkspaceSize();
-    VLOG(4) << "Cached execution plan found." << cached_plan.getTag()
-            << "; Require workspace: " << workspace_size;
-    workspace_handle->RunFunc(
-        [&](void* workspace_ptr) {
-          void* data_ptrs[] = {x_tensor_data, dy_tensor_data, dw_tensor_data};
-          int64_t uids[] = {'x', 'y', 'w'};
-          auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                  .setWorkspacePointer(workspace_ptr)
-                                  .setDataPointers(3, data_ptrs)
-                                  .setUids(3, uids)
-                                  .build();
-          PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
-              handle, cached_plan.get_raw_desc(), variant_pack.get_raw_desc()));
-        },
-        workspace_size);
+  if (plan_cache_bwd_filter.FindPlan(op_graph, handle)) {
+    const cudnn_frontend::ExecutionPlan* cached_plan = nullptr;
+    int64_t workspace_size = 0;
+    plan_cache_bwd_filter.GetPlan(
+        op_graph, &cached_plan, &workspace_size, handle);
+    helper::ExecutePlan(handle,
+                        workspace_handle,
+                        x_tensor_data,
+                        dy_tensor_data,
+                        dw_tensor_data,
+                        cached_plan->get_raw_desc(),
+                        workspace_size);
     return;
   }
@@ -428,39 +530,15 @@ void CudnnConvBwdFilterV8(const DenseTensor* x_tensor,
       handle,
       workspace_handle);
-  for (auto& plan : plans) {
-    try {
-      int64_t workspace_size = plan.getWorkspaceSize();
-      workspace_handle->RunFunc(
-          [&](void* workspace_ptr) {
-            void* data_ptrs[] = {x_tensor_data, dy_tensor_data, dw_tensor_data};
-            int64_t uids[] = {'x', 'y', 'w'};
-            auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                    .setWorkspacePointer(workspace_ptr)
-                                    .setDataPointers(3, data_ptrs)
-                                    .setUids(3, uids)
-                                    .build();
-            PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
-                handle, plan.get_raw_desc(), variant_pack.get_raw_desc()));
-          },
-          workspace_size);
-      if (!exhaustive_search ||
-          plan_cache_bwd_filter.IsStable(op_graph, plan.getTag())) {
-        plan_cache_bwd_filter.InsertPlan(op_graph, plan);
-      }
-      return;
-    } catch (cudnn_frontend::cudnnException& e) {
-      VLOG(4) << "Plan " << plan.describe()
-              << "failed to execute. Trying next plan.";
-    } catch (phi::enforce::EnforceNotMet& e) {
-      VLOG(4) << "Plan " << plan.describe()
-              << "failed to execute. Trying next plan.";
-    }
-  }
-  PADDLE_THROW(phi::errors::InvalidArgument(
-      "[CUDNN Frontend API] No valid plan could "
-      "be found to execute conv backward filter."));
+  helper::ExecutePlansAndCache(handle,
+                               workspace_handle,
+                               x_tensor_data,
+                               dy_tensor_data,
+                               dw_tensor_data,
+                               &plans,
+                               exhaustive_search,
+                               op_graph,
+                               &plan_cache_bwd_filter);
 }
 }  // namespace phi
@@ -261,27 +261,16 @@ void ConvCudnnKernelImplV8(const DenseTensor* input_tensor,
                         alpha,
                         beta);
-  if (plan_cache.FindPlan(op_graph)) {
-    auto engine_config = plan_cache.GetConfig(op_graph, handle);
-    auto cached_plan = cudnn_frontend::ExecutionPlanBuilder()
-                           .setHandle(handle)
-                           .setEngineConfig(engine_config, op_graph.getTag())
-                           .build();
-    auto workspace_size = cached_plan.getWorkspaceSize();
-    VLOG(4) << "Cached execution plan found." << cached_plan.getTag()
-            << "; Require workspace: " << workspace_size;
-    workspace_handle.RunFunc(
-        [&](void* workspace_ptr) {
-          void* data_ptrs[] = {input_data, output_data, filter_data};
-          int64_t uids[] = {'x', 'y', 'w'};
-          auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                  .setWorkspacePointer(workspace_ptr)
-                                  .setDataPointers(3, data_ptrs)
-                                  .setUids(3, uids)
-                                  .build();
-          PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
-              handle, cached_plan.get_raw_desc(), variant_pack.get_raw_desc()));
-        },
-        workspace_size);
+  if (plan_cache.FindPlan(op_graph, handle)) {
+    const cudnn_frontend::ExecutionPlan* cached_plan = nullptr;
+    int64_t workspace_size = 0;
+    plan_cache.GetPlan(op_graph, &cached_plan, &workspace_size, handle);
+    helper::ExecutePlan(handle,
+                        &workspace_handle,
+                        input_data,
+                        output_data,
+                        filter_data,
+                        cached_plan->get_raw_desc(),
+                        workspace_size);
     return;
   }
@@ -295,37 +284,15 @@ void ConvCudnnKernelImplV8(const DenseTensor* input_tensor,
       handle,
       &workspace_handle);
-  for (auto& plan : plans) {
-    try {
-      int64_t workspace_size = plan.getWorkspaceSize();
-      workspace_handle.RunFunc(
-          [&](void* workspace_ptr) {
-            void* data_ptrs[] = {input_data, output_data, filter_data};
-            int64_t uids[] = {'x', 'y', 'w'};
-            auto variant_pack = cudnn_frontend::VariantPackBuilder()
-                                    .setWorkspacePointer(workspace_ptr)
-                                    .setDataPointers(3, data_ptrs)
-                                    .setUids(3, uids)
-                                    .build();
-            PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBackendExecute(
-                handle, plan.get_raw_desc(), variant_pack.get_raw_desc()));
-          },
-          workspace_size);
-      if (!exhaustive_search || plan_cache.IsStable(op_graph, plan.getTag())) {
-        plan_cache.InsertPlan(op_graph, plan);
-      }
-      return;
-    } catch (cudnn_frontend::cudnnException& e) {
-      VLOG(4) << "Plan " << plan.describe()
-              << "failed to execute. Trying next plan.";
-    } catch (phi::enforce::EnforceNotMet& e) {
-      VLOG(4) << "Plan " << plan.describe()
-              << "failed to execute. Trying next plan.";
-    }
-  }
-  PADDLE_THROW(
-      phi::errors::InvalidArgument("[CUDNN Frontend API] No valid plan could "
-                                   "be found to execute conv."));
+  helper::ExecutePlansAndCache(handle,
+                               &workspace_handle,
+                               input_data,
+                               output_data,
+                               filter_data,
+                               &plans,
+                               exhaustive_search,
+                               op_graph,
+                               &plan_cache);
 }
 #endif
......
Deleted file: patches/cudnn-frontend/0001-patch-for-paddle.patch

From dce3465da518641ee177187fbc0c0d36faea28f2 Mon Sep 17 00:00:00 2001
From: Tian Zheng <tizheng@nvidia.com>
Date: Thu, 27 Oct 2022 20:33:16 -0700
Subject: [PATCH] patch for paddle
---
include/cudnn_frontend_ExecutionPlan.h | 10 +++++++---
include/cudnn_frontend_ExecutionPlanCache.h | 2 +-
include/cudnn_frontend_OperationGraph.h | 2 +-
include/cudnn_frontend_find_plan.h | 6 +++---
include/cudnn_frontend_get_plan.h | 4 ++--
5 files changed, 14 insertions(+), 10 deletions(-)
diff --git a/include/cudnn_frontend_ExecutionPlan.h b/include/cudnn_frontend_ExecutionPlan.h
index 7bed4b4..3314b5c 100644
--- a/include/cudnn_frontend_ExecutionPlan.h
+++ b/include/cudnn_frontend_ExecutionPlan.h
@@ -167,6 +167,10 @@ class ExecutionPlan_v8 : public BackendDescriptor {
return json_string;
#endif
}
+
+ ManagedOpaqueDescriptor GetEngineConfig() const {
+ return engine_config;
+ }
ExecutionPlan_v8(ExecutionPlan_v8 const &) = default;
ExecutionPlan_v8 &
@@ -182,7 +186,7 @@ class ExecutionPlan_v8 : public BackendDescriptor {
CUDNN_TYPE_NUMERICAL_NOTE,
CUDNN_NUMERICAL_NOTE_TYPE_COUNT,
&elem_count,
- NULL);
+ nullptr);
numeric_notes_vec.resize(elem_count);
status = cudnnBackendGetAttribute(extractedEngine_,
CUDNN_ATTR_ENGINE_NUMERICAL_NOTE,
@@ -206,7 +210,7 @@ class ExecutionPlan_v8 : public BackendDescriptor {
CUDNN_TYPE_BEHAVIOR_NOTE,
CUDNN_BEHAVIOR_NOTE_TYPE_COUNT,
&elem_count,
- NULL);
+ nullptr);
behavior_notes_vec.resize(elem_count);
status = cudnnBackendGetAttribute(extractedEngine_,
CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE,
@@ -310,7 +314,7 @@ class ExecutionPlan_v8 : public BackendDescriptor {
CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE,
CUDNN_TYPE_INT64,
1,
- NULL,
+ nullptr,
&workSpaceSize);
if (status != CUDNN_STATUS_SUCCESS) {
set_error_and_throw_exception(this,
diff --git a/include/cudnn_frontend_ExecutionPlanCache.h b/include/cudnn_frontend_ExecutionPlanCache.h
index 99a157c..741c490 100644
--- a/include/cudnn_frontend_ExecutionPlanCache.h
+++ b/include/cudnn_frontend_ExecutionPlanCache.h
@@ -94,7 +94,7 @@ class ExecutionPlanCache_v1 {
/// String to map of feature_vector to execution plan
/// For a given FeatureVector of type T according to the Operation Graph, we get the plan.
- using FeatureVectorToPlanMap = std::map<cudnn_frontend::feature_vector_t, cudnn_frontend::ExecutionPlan, cudnn_frontend::ExecutionPlanCache_v1::compare>;
+ using FeatureVectorToPlanMap = std::map<cudnn_frontend::feature_vector_t, cudnn_frontend::ExecutionPlan>;
FeatureVectorToPlanMap cache;
mutable std::mutex cache_mutex;
diff --git a/include/cudnn_frontend_OperationGraph.h b/include/cudnn_frontend_OperationGraph.h
index 1478ce8..7894080 100644
--- a/include/cudnn_frontend_OperationGraph.h
+++ b/include/cudnn_frontend_OperationGraph.h
@@ -78,7 +78,7 @@ class OperationGraph_v8 : public BackendDescriptor {
CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT,
CUDNN_TYPE_INT64,
1,
- NULL,
+ nullptr,
&global_count);
if (status != CUDNN_STATUS_SUCCESS) {
set_error_and_throw_exception(this,
diff --git a/include/cudnn_frontend_find_plan.h b/include/cudnn_frontend_find_plan.h
index 02a08a1..5f94e45 100644
--- a/include/cudnn_frontend_find_plan.h
+++ b/include/cudnn_frontend_find_plan.h
@@ -53,7 +53,7 @@ time_sorted_plan(cudnnHandle_t handle, executionPlans_t plans, VariantPack const
cudaDeviceSynchronize();
cudaStream_t stream = nullptr;
- ::cudnnGetStream(handle, &stream);
+ cudnnGetStream(handle, &stream);
for (auto &plan : plans) {
float time_ms = 0.0f;
@@ -61,7 +61,7 @@ time_sorted_plan(cudnnHandle_t handle, executionPlans_t plans, VariantPack const
float min_time_ms = std::numeric_limits<float>::max();
// Warm-up run
- auto warmup_status = ::cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc());
+ auto warmup_status = cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc());
if (warmup_status != CUDNN_STATUS_SUCCESS) {
getLogger() << "[cudnn_frontend] Plan " << plan.getTag() << " failed with " << to_string(warmup_status) << std::endl;
continue;
@@ -71,7 +71,7 @@ time_sorted_plan(cudnnHandle_t handle, executionPlans_t plans, VariantPack const
for (int i = 0; i < maxIterCount; i++) {
cudaEventRecord(start, stream);
- ::cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc());
+ cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc());
cudaEventRecord(stop, stream);
cudaEventSynchronize(stop);
diff --git a/include/cudnn_frontend_get_plan.h b/include/cudnn_frontend_get_plan.h
index 50535ab..c43eec9 100644
--- a/include/cudnn_frontend_get_plan.h
+++ b/include/cudnn_frontend_get_plan.h
@@ -26,7 +26,7 @@
namespace cudnn_frontend {
-auto
+inline auto
EngineConfigGenerator::cudnnGetPlan(cudnnHandle_t handle, OperationGraph & opGraph)
-> executionPlans_t {
// Creating a set of execution plans that are supported.
@@ -47,7 +47,7 @@ EngineConfigGenerator::cudnnGetPlan(cudnnHandle_t handle, OperationGraph & opGra
return plans;
}
-auto
+inline auto
EngineConfigGenerator::cudnnGetPlan(cudnnHandle_t handle, OperationGraph & opGraph, Predicate pred)
-> executionPlans_t {
// Creating a set of execution plans that are supported.
--
2.25.1
@@ -141,6 +141,10 @@ class TestStaticAutoTuneStatus(TestAutoTune):
         exe.run(startup_program)
         x = np.random.random(size=data_shape).astype('float32')
+        # Note(tizheng): warmup run to make sure the following runs
+        # are in the same thread. Necessary for CUDNNv8 tests
+        exe.run(program=main_program, feed={'X': x}, fetch_list=[loss])
+
         self.set_flags(enable_autotune)
         if enable_autotune:
             config = {"kernel": {"enable": True, "tuning_range": [1, 2]}}
......