Unverified commit b4adbe5c, authored by Yiqun Liu, committed by GitHub

[Cherry-pick 2.3] Autotune the workspace and kernel choosing of conv (#41833)

Cherry-pick #40338 #41741 #41313
Parent a9d8b947
@@ -15,7 +15,7 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
   add_subdirectory(pylayer)
   cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator)
   add_dependencies(grad_tensor_holder eager_final_state_codegen)
-  cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info)
+  cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info switch_autotune)
 endif()
 cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor)
...
@@ -16,7 +16,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator_kernel_configs.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 namespace paddle {
...
@@ -9,8 +9,8 @@ cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_f
 add_subdirectory(jit)
 cc_library(amp SRCS amp_auto_cast.cc DEPS layer var_helper)
 cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector var_helper)
-cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator)
-cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator)
+cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator switch_autotune)
+cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator switch_autotune)
 cc_library(imperative_profiler SRCS profiler.cc DEPS flags)
 if(NOT WIN32)
   if(WITH_NCCL OR WITH_RCCL)
...
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <array>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/conv_search_cache.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/autotune/cache.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DataLayout = platform::DataLayout;
using framework::AlgorithmsCache;
using framework::ConvSearchCache;
template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
// The base template for the SearchAlgorithm struct.
template <typename PerfT>
struct SearchAlgorithm {};
// The container for the result of SearchAlgorithm::Find().
template <typename AlgoT>
struct SearchResult {
SearchResult() {}
explicit SearchResult(AlgoT a) : algo(a) {}
AlgoT algo = static_cast<AlgoT>(0);
float time = -1.f;
size_t workspace_size = 0;
};
template <typename T>
static std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
out << "[";
for (auto const& tmp : v) out << tmp << ",";
out << "]";
return out;
}
// The container for convolution-related descriptors.
template <typename HandleT, typename DataT>
struct ConvArgsBase {
HandleT handle;
platform::TensorDescriptor idesc, odesc;
platform::FilterDescriptor wdesc;
platform::ConvolutionDescriptor cdesc;
const framework::Tensor *x, *w, *o;
DataT cudnn_dtype;
// strides
std::vector<int> s;
// paddings
std::vector<int> p;
// dilations
std::vector<int> d;
ConvArgsBase(const framework::Tensor* x, const framework::Tensor* w,
const framework::Tensor* o, const std::vector<int> s,
const std::vector<int> p, const std::vector<int> d, DataT dtype)
: x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {}
template <typename T>
size_t GetCacheKey() const {
auto x_shape = phi::vectorize(x->dims());
auto w_shape = phi::vectorize(w->dims());
VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape
<< ", strides=" << s << ", paddings=" << p << ", dilations=" << d;
return phi::autotune::ConvKey(
x_shape, w_shape, p, s, d,
paddle::experimental::CppTypeToDataType<T>::Type());
}
};
static inline void GetNCDHW(const framework::DDim& dims,
const DataLayout& layout, int* N, int* C, int* D,
int* H, int* W) {
*N = dims[0];
*C = layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
int i = layout == DataLayout::kNCHW ? 0 : 1;
if (dims.size() == 5) {
*D = dims[2 - i];
*H = dims[3 - i];
*W = dims[4 - i];
} else {
*D = 1;
*H = dims[2 - i];
*W = dims[3 - i];
}
}
} // namespace operators
} // namespace paddle
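For readers new to the autotune flow, the sketch below illustrates how a key produced by `ConvArgsBase::GetCacheKey` is meant to be consumed: the searcher first looks the key up in a per-operator algorithm cache and only falls back to a (heuristic or exhaustive) search on a miss, which is the pattern the `SearchAlgorithm` specializations later in this diff follow with `phi::autotune::AutoTuneCache`. This is a simplified, self-contained illustration, not Paddle's implementation: a plain `std::unordered_map` and a home-grown hash stand in for `AutoTuneCache` and `phi::autotune::ConvKey`, and `SimpleConvKey` is a hypothetical helper.

// Simplified stand-in for the cache-lookup pattern used by SearchAlgorithm::Find().
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <unordered_map>
#include <vector>

static size_t HashCombine(size_t seed, size_t v) {
  return seed ^ (v + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}

// Hypothetical replacement for phi::autotune::ConvKey: hash shapes, paddings,
// strides, dilations and dtype into a single key.
static size_t SimpleConvKey(const std::vector<std::vector<int64_t>>& fields,
                            int dtype) {
  size_t key = std::hash<int>()(dtype);
  for (const auto& f : fields)
    for (int64_t v : f) key = HashCombine(key, std::hash<int64_t>()(v));
  return key;
}

int main() {
  std::unordered_map<size_t, int64_t> cache;       // key -> cached algo id
  auto search_slowly = [] { return int64_t{1}; };  // pretend exhaustive search

  size_t key = SimpleConvKey(
      {{32, 3, 224, 224}, {64, 3, 7, 7}, {3, 3}, {2, 2}, {1, 1}}, /*dtype=*/0);
  int64_t algo;
  auto it = cache.find(key);
  if (it != cache.end()) {
    algo = it->second;       // cache hit: reuse the previously chosen algorithm
  } else {
    algo = search_slowly();  // cache miss: search, then remember the result
    cache.emplace(key, algo);
  }
  std::cout << "algo=" << algo << "\n";
}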
@@ -14,44 +14,17 @@ limitations under the License. */

 #pragma once

-#include <algorithm>
-#include <array>
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/conv_search_cache.h"
-#include "paddle/fluid/framework/operator_kernel_configs.h"
-#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
-#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/fluid/operators/conv_base_helper.h"
 #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
-#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
-#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/profiler.h"
+#include "paddle/phi/kernels/autotune/switch_autotune.h"
+#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"

 namespace paddle {
 namespace operators {

-using Tensor = framework::Tensor;
-using DataLayout = platform::DataLayout;
-template <typename T>
-using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
-using framework::AlgorithmsCache;
-
-static inline void GetNCDHW(const framework::DDim& dims,
-                            const DataLayout& layout, int* N, int* C, int* D,
-                            int* H, int* W) {
-  *N = dims[0];
-  *C = layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
-  int i = layout == DataLayout::kNCHW ? 0 : 1;
-  if (dims.size() == 5) {
-    *D = dims[2 - i];
-    *H = dims[3 - i];
-    *W = dims[4 - i];
-  } else {
-    *D = 1;
-    *H = dims[2 - i];
-    *W = dims[3 - i];
-  }
-}
+using ConvArgs = ConvArgsBase<cudnnHandle_t, cudnnDataType_t>;

 template <typename DeviceContext, typename T, size_t D>
 static void RemovePaddingSlice(const phi::GPUContext& context,
@@ -68,121 +41,117 @@ static void RemovePaddingSlice(const phi::GPUContext& context,
extents[i] = new_out_dims[i]; extents[i] = new_out_dims[i];
} }
int start;
for (size_t i = 0; i < axes.size(); ++i) { for (size_t i = 0; i < axes.size(); ++i) {
start = starts[i]; int start = starts[i];
if (start < 0) { if (start < 0) {
start = (start + in_dims[axes[i]]); start = (start + in_dims[axes[i]]);
} }
start = std::max(start, 0); start = std::max(start, 0);
offsets[axes[i]] = start; offsets[axes[i]] = start;
} }
auto in_t = auto in_t =
framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From( framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*input); *input);
auto out_t = auto out_t =
framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From( framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*out, new_out_dims); *out, new_out_dims);
EigenSlice<std::decay_t<decltype(place)>, T, D>::Eval(place, out_t, in_t,
offsets, extents); phi::funcs::EigenSlice<std::decay_t<decltype(place)>, T, D>::Eval(
place, out_t, in_t, offsets, extents);
} }
template <typename T> static inline double ToMegaBytes(size_t bytes) {
std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) { return static_cast<double>(bytes) / (1 << 20);
out << "[";
for (auto const& tmp : v) out << tmp << ",";
out << "]";
return out;
} }
inline int MaxBwdFilterAlgos(cudnnHandle_t cudnn_handle) { static inline bool UseFixedWorkspace() {
int max_algos = 0; return FLAGS_conv_workspace_size_limit >= 0;
#if CUDNN_VERSION_MIN(7, 0, 1)
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
cudnn_handle, &max_algos));
#endif
return max_algos;
} }
template <typename PerfType, typename AlgoType> static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
void ChooseAlgoByWorkspace(PerfType* perf_results, size_t perf_num, if (!use_fixed_workspace) {
size_t workspace_byte, AlgoType* algo) { int device_id = platform::GetCurrentDeviceId();
for (size_t i = 0; i < perf_num; ++i) { int64_t allocated = memory::StatGetCurrentValue("Allocated", device_id);
auto result = perf_results[i]; int64_t reserved = memory::StatGetCurrentValue("Reserved", device_id);
if (result.status == CUDNN_STATUS_SUCCESS && int64_t availble = platform::GpuAvailableMemToAlloc();
result.memory < workspace_byte) { VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
*algo = result.algo; << " MB, reserved=" << ToMegaBytes(reserved)
VLOG(3) << " algo: " << result.algo << ", time: " << result.time << " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB.";
<< " ms, wksp = " << result.memory return std::max(availble, reserved - allocated);
<< ", status = " << result.status; } else {
return; return FLAGS_conv_workspace_size_limit * 1024 * 1024;
}
} }
VLOG(3) << "Can not find alog that requires memory < "
<< static_cast<double>(workspace_byte) / (1 << 20) << " MB";
} }
template <typename PerfType, typename AlgoType> template <typename PerfT>
void ChooseAlgo(const std::vector<PerfType>& perf_results, std::string GetPerfResultString(std::string prefix,
size_t workspace_byte, AlgoType* algo) { const std::vector<PerfT>& perf_results,
VLOG(3) << "=========BwdFilterAlgo Perf result========="; int actual_algo_count, size_t workspace_limit) {
for (const auto& result : perf_results) { std::ostringstream out;
auto math_type_str = "False"; out << prefix << " (workspace limit=" << ToMegaBytes(workspace_limit)
if (result.mathType == CUDNN_TENSOR_OP_MATH) { << " MB):\n";
math_type_str = "True"; for (int i = 0; i < actual_algo_count; ++i) {
} const auto& result = perf_results[i];
VLOG(3) << " algo: " << result.algo << ", TensorCore: " << math_type_str auto math_type_str = (result.mathType == CUDNN_TENSOR_OP_MATH) ? "T" : "F";
<< ", time: " << result.time << " ms" out << " algo=" << result.algo << ": tensor_core=" << math_type_str
<< ", wksp = " << result.memory << ", status = " << result.status; << ", time=" << result.time
<< " ms, memory=" << ToMegaBytes(result.memory)
<< " MB, status=" << result.status << "\n";
} }
return out.str();
}
for (size_t i = 0; i != perf_results.size(); ++i) { // Choose an algorithm which has the minimize time cost and less memory.
const auto& result = perf_results[i]; // NOTE: perf_results is ordered by time.
template <typename PerfT, typename AlgoT>
void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
size_t workspace_limit,
SearchResult<AlgoT>* search_result) {
int best_algo_idx = -1;
for (size_t i = 0; i < perf_results.size(); ++i) {
auto result = perf_results[i];
if (result.status == CUDNN_STATUS_SUCCESS && if (result.status == CUDNN_STATUS_SUCCESS &&
(result.memory <= workspace_byte)) { result.memory < workspace_limit) {
if ((result.mathType == CUDNN_TENSOR_OP_MATH) && if (best_algo_idx == -1) {
(i != perf_results.size() - 1)) { // The algorithm which has minimize time cost and need a workspace_size
const auto& next_result = perf_results[i + 1]; // fitting the workspace_limit constraint.
if (next_result.status == CUDNN_STATUS_SUCCESS && best_algo_idx = i;
next_result.algo == result.algo && // Each perf_results[i].time is set to be -1 in heuristic search.
next_result.memory == result.memory && if (perf_results[best_algo_idx].time < 0) {
next_result.mathType != CUDNN_TENSOR_OP_MATH && break;
next_result.time < 1.01 * result.time) { }
// Skip over this result- it's not really a Tensor Core algo. } else {
// Because it is only 1% performance difference. float best_algo_time = perf_results[best_algo_idx].time;
// Prefer to choose the next equivalent non-Tensor Core algo. if ((result.time - best_algo_time) / best_algo_time < 0.01) {
continue; best_algo_idx = (result.memory < perf_results[best_algo_idx].memory)
? i
: best_algo_idx;
break;
} }
} }
*algo = result.algo;
auto math_type_str = "0";
if (result.mathType == CUDNN_TENSOR_OP_MATH) {
math_type_str = "1";
}
VLOG(3) << " choose algo: " << result.algo << ", TC: " << math_type_str
<< ", time: " << result.time << " ms"
<< ", wksp = " << result.memory << ", status = " << result.status;
break;
} }
} }
if (best_algo_idx != -1) {
search_result->algo = perf_results[best_algo_idx].algo;
search_result->time = perf_results[best_algo_idx].time;
search_result->workspace_size = perf_results[best_algo_idx].memory;
} else {
VLOG(3) << "Can not find an algorithm that requires memory < "
<< ToMegaBytes(workspace_limit) << " MB";
}
} }
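Since `ChooseAlgoByWorkspace` relies on cuDNN returning `perf_results` ordered by time, its rule can be summarised as: take the first successful result that fits the workspace limit, then allow a marginally slower result (within 1%) to win if it needs less memory. Below is a minimal, self-contained restatement of that rule on fabricated data; `FakePerf` is a hypothetical struct standing in for `cudnnConvolution*AlgoPerf_t`, not part of cuDNN or Paddle.

// Minimal restatement of the selection rule in ChooseAlgoByWorkspace, on fake data.
#include <cstddef>
#include <iostream>
#include <vector>

struct FakePerf {   // stand-in for cudnnConvolution*AlgoPerf_t
  int algo;
  bool ok;          // stand-in for status == CUDNN_STATUS_SUCCESS
  float time;       // ms; results are assumed to be sorted by time
  size_t memory;    // workspace bytes
};

int Choose(const std::vector<FakePerf>& results, size_t limit) {
  int best = -1;
  for (size_t i = 0; i < results.size(); ++i) {
    const auto& r = results[i];
    if (!r.ok || r.memory >= limit) continue;
    if (best == -1) {
      best = static_cast<int>(i);  // fastest result that fits the limit
    } else if ((r.time - results[best].time) / results[best].time < 0.01f) {
      // Within 1% of the best time: prefer the smaller workspace.
      if (r.memory < results[best].memory) best = static_cast<int>(i);
      break;
    }
  }
  return best < 0 ? -1 : results[best].algo;
}

int main() {
  std::vector<FakePerf> results = {
      {0, true, 1.00f, 900 << 20},   // fastest, but needs 900 MB
      {6, true, 1.004f, 64 << 20},   // 0.4% slower, only 64 MB
      {2, true, 1.50f, 16 << 20},
  };
  std::cout << Choose(results, 512u << 20) << "\n";  // prints 6
}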
using framework::ConvSearchCache;
static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype, static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype,
const platform::ConvolutionDescriptor& cdesc) { const platform::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
auto& dev_ctx = ctx; if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_TENSOR_OP_MATH)); cdesc.desc(), CUDNN_TENSOR_OP_MATH));
VLOG(5) << "use cudnn_tensor_op_math"; VLOG(5) << "use cudnn_tensor_op_math";
#if CUDA_VERSION >= 11000 #if CUDA_VERSION >= 11000
#if CUDNN_VERSION_MIN(8, 1, 0) #if CUDNN_VERSION_MIN(8, 1, 0)
} else if (dev_ctx.GetComputeCapability() >= 80 && } else if (ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) {
dtype == CUDNN_DATA_BFLOAT16) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_TENSOR_OP_MATH)); cdesc.desc(), CUDNN_TENSOR_OP_MATH));
#endif // CUDNN_VERSION_MIN(8, 1, 0) #endif // CUDNN_VERSION_MIN(8, 1, 0)
...@@ -198,411 +167,593 @@ static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype, ...@@ -198,411 +167,593 @@ static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype,
#endif #endif
} }
-struct ConvArgs {
-  cudnnHandle_t handle;
-  platform::TensorDescriptor idesc, odesc;
-  platform::FilterDescriptor wdesc;
-  platform::ConvolutionDescriptor cdesc;
-  const framework::Tensor *x, *w, *o;
-  cudnnDataType_t cudnn_dtype;
-  // strides
-  std::vector<int> s;
-  // paddings
-  std::vector<int> p;
-  // dilations
-  std::vector<int> d;
-
-  ConvArgs(const framework::Tensor* x, const framework::Tensor* w,
-           const framework::Tensor* o, const std::vector<int> s,
-           const std::vector<int> p, const std::vector<int> d,
-           cudnnDataType_t dtype)
-      : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {}
-};
-
-template <typename perf_t>
-struct SearchAlgorithm {};
+// cuDNN convolution forward-algorithm searcher, consisting of three search
+// modes (deterministic, heuristic, and exhaustive_search), plus a workspace
+// size acquisition function for the chosen algorithm.

 template <>
 struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
using perf_t = cudnnConvolutionFwdAlgoPerf_t; using PerfT = cudnnConvolutionFwdAlgoPerf_t;
using algo_t = cudnnConvolutionFwdAlgo_t; using AlgoT = cudnnConvolutionFwdAlgo_t;
template <typename T> template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search, static SearchResult<AlgoT> Find(const ConvArgs& args, bool exhaustive_search,
bool deterministic, const phi::GPUContext& ctx) { bool deterministic,
const phi::GPUContext& ctx) {
SearchResult<AlgoT> result;
auto dtype = platform::CudnnDataType<T>::type; auto dtype = platform::CudnnDataType<T>::type;
bool has_got_workspace_size = true;
size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
size_t workspace_size = 0;
algo_t algo;
SetConvMathType(ctx, dtype, args.cdesc); SetConvMathType(ctx, dtype, args.cdesc);
if (!exhaustive_search && !deterministic) { if (deterministic) {
result = FindAlgoDeterministic();
} else {
// 1. Once the exhaustive-search FLAG is turned on, always run exhaustive search.
// 2. Once auto-tune is turned on, run heuristic search (the default) before the
//    auto-tune process and run exhaustive search during that process.
// 3. After the auto-tune process, run the cached algorithm if one is cached;
//    otherwise run the default mode.
size_t key = args.GetCacheKey<T>();
auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward();
if (cache.Find(key)) {
result.algo = static_cast<AlgoT>(cache.Get(key));
} else {
bool use_autotune =
phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
if (exhaustive_search || use_autotune) {
result = FindAlgoExhaustiveSearch<T>(args, ctx);
cache.Set(key, static_cast<int64_t>(result.algo));
} else {
result = FindAlgoHeuristic(args, ctx);
}
}
}
VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
<< ", deterministic=" << deterministic
<< ", choose algo=" << result.algo << ", workspace="
<< ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB";
return result;
}
static size_t GetWorkspaceSize(const ConvArgs& args,
cudnnConvolutionFwdAlgo_t algo) {
size_t workspace_size = 0;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
args.handle, args.idesc.desc(), args.wdesc.desc(),
args.cdesc.desc(), args.odesc.desc(), algo, &workspace_size));
return workspace_size;
}
private:
static SearchResult<AlgoT> FindAlgoDeterministic() {
return SearchResult<AlgoT>(static_cast<AlgoT>(1));
}
// Heuristic search mode, calling the cudnnGetXxxAlgorithm.
static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
const phi::GPUContext& ctx) {
SearchResult<AlgoT> result;
size_t workspace_size_limit =
CalcWorkspaceLimitInBytes(UseFixedWorkspace());
#if CUDNN_VERSION >= 7001 #if CUDNN_VERSION >= 7001
int perf_count; int actual_perf_count;
int best_algo_idx = 0; int best_algo_idx = 0;
std::unique_ptr<perf_t[]> perf_results(new perf_t[kNUM_CUDNN_FWD_ALGS]); std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7(
args.handle, args.idesc.desc(), args.wdesc.desc(), args.handle, args.idesc.desc(), args.wdesc.desc(),
args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS, args.cdesc.desc(), args.odesc.desc(), kNUM_CUDNN_FWD_ALGS,
&perf_count, perf_results.get())); &actual_perf_count, perf_results.data()));
algo = (perf_results.get())[best_algo_idx].algo; result.algo = perf_results[best_algo_idx].algo;
workspace_size = (perf_results.get())[best_algo_idx].memory; result.workspace_size = perf_results[best_algo_idx].memory;
if (workspace_size > workspace_size_limit) { if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000 #if CUDNN_VERSION >= 8000
// cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8 // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8
ChooseAlgoByWorkspace<perf_t, algo_t>(perf_results.get(), ChooseAlgoByWorkspace<PerfT, AlgoT>(perf_results, workspace_size_limit,
kNUM_CUDNN_FWD_ALGS, &result);
workspace_size_limit, &algo);
#else
VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<< workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm(
args.handle, args.idesc.desc(), args.wdesc.desc(),
args.cdesc.desc(), args.odesc.desc(),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
#endif
}
#else #else
VLOG(3) << "Fallback to non-v7 method to find conv algorithm "
"becasue the workspace size request("
<< result.workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardAlgorithm( platform::dynload::cudnnGetConvolutionForwardAlgorithm(
args.handle, args.idesc.desc(), args.wdesc.desc(), args.handle, args.idesc.desc(), args.wdesc.desc(),
args.cdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.odesc.desc(),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo)); workspace_size_limit, &(result.algo)));
#endif #endif
VLOG(3) << "choose algo " << algo;
} else if (deterministic) {
algo = static_cast<cudnnConvolutionFwdAlgo_t>(1);
} else {
auto& dev_ctx = ctx;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
AlgorithmsCache<algo_t>& algo_cache =
*(framework::ConvSearchCache::Instance().GetForward());
auto x_dims = phi::vectorize(args.x->dims());
auto w_dims = phi::vectorize(args.w->dims());
VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:"
<< ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
<< args.s << ", args.p" << args.p << ", args.d" << args.d;
algo = algo_cache.GetAlgorithm(
x_dims, w_dims, args.s, args.p, args.d, 0,
static_cast<int64_t>(args.cudnn_dtype), [&]() {
int returned_algo_count;
std::array<perf_t, kNUM_CUDNN_FWD_ALGS> perf_stat;
auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
args.handle, args.idesc.desc(), args.x->data<T>(),
args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
args.odesc.desc(), const_cast<T*>(args.o->data<T>()),
kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
perf_stat.data(), cudnn_workspace_ptr,
workspace_size_limit));
};
workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
VLOG(3) << "FwdAlgo Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = perf_stat[i];
VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
<< " " << stat.memory;
}
return perf_stat[0].algo;
});
} }
VLOG(3) << "choose algo " << algo; #else
return algo;
}
static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
size_t workspace_size = 0;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( platform::dynload::cudnnGetConvolutionForwardAlgorithm(
args.handle, args.idesc.desc(), args.wdesc.desc(), args.handle, args.idesc.desc(), args.wdesc.desc(),
args.cdesc.desc(), args.odesc.desc(), algo, &workspace_size)); args.cdesc.desc(), args.odesc.desc(),
return workspace_size; CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit,
&(result.algo)));
#endif
return result;
}
template <typename T>
static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
const ConvArgs& args, const phi::GPUContext& ctx) {
SearchResult<AlgoT> result;
size_t workspace_size_limit =
CalcWorkspaceLimitInBytes(UseFixedWorkspace());
size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit);
VLOG(4) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
<< " MB";
int returned_algo_count;
std::vector<PerfT> perf_results(kNUM_CUDNN_FWD_ALGS);
auto cudnn_find_func = [&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
args.handle, args.idesc.desc(), args.x->data<T>(),
args.wdesc.desc(), args.w->data<T>(), args.cdesc.desc(),
args.odesc.desc(), const_cast<T*>(args.o->data<T>()),
kNUM_CUDNN_FWD_ALGS, &returned_algo_count, perf_results.data(),
workspace_ptr, max_workspace_size));
};
auto workspace_handle = ctx.cudnn_workspace_handle();
workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size,
UseFixedWorkspace());
VLOG(4) << GetPerfResultString<PerfT>(
"[Exhaustive Search] FwdAlgo Perf result", perf_results,
returned_algo_count, workspace_size_limit);
ChooseAlgoByWorkspace<PerfT, AlgoT>(perf_results, workspace_size_limit,
&result);
return result;
}
static size_t GetMaxWorkspaceSize(const ConvArgs& args,
size_t workspace_size_limit) {
if (!UseFixedWorkspace()) {
size_t max_workspace_size = 0;
for (size_t algo = 0; algo < kNUM_CUDNN_FWD_ALGS; ++algo) {
size_t workspace_size = 0;
auto status =
platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
args.handle, args.idesc.desc(), args.wdesc.desc(),
args.cdesc.desc(), args.odesc.desc(),
static_cast<cudnnConvolutionFwdAlgo_t>(algo), &workspace_size);
if (status == CUDNN_STATUS_SUCCESS &&
workspace_size <= workspace_size_limit) {
max_workspace_size = std::max(workspace_size, max_workspace_size);
}
}
return max_workspace_size;
} else {
return workspace_size_limit;
}
} }
}; };
// cuDNN convolution backward-data algorithm searcher, consisting of three
// search modes: deterministic, heuristic, and exhaustive_search. Notably,
// there are two patterns of exhaustive search, one for HALF precision only
// and one for the rest. It also provides a workspace size acquisition
// function for the chosen algorithm.
template <> template <>
struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> { struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
using perf_t = cudnnConvolutionBwdDataAlgoPerf_t; using PerfT = cudnnConvolutionBwdDataAlgoPerf_t;
using algo_t = cudnnConvolutionBwdDataAlgo_t; using AlgoT = cudnnConvolutionBwdDataAlgo_t;
template <typename T> template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search, static SearchResult<AlgoT> Find(const ConvArgs& args, bool exhaustive_search,
bool deterministic, const phi::GPUContext& ctx) { bool deterministic,
const phi::GPUContext& ctx) {
SearchResult<AlgoT> result;
auto dtype = platform::CudnnDataType<T>::type; auto dtype = platform::CudnnDataType<T>::type;
size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
size_t workspace_size = 0;
bool has_got_workspace_size = true;
algo_t algo;
SetConvMathType(ctx, dtype, args.cdesc); SetConvMathType(ctx, dtype, args.cdesc);
if (!exhaustive_search && !deterministic) { if (deterministic) {
result = FindAlgoDeterministic();
} else {
// 1. Once the exhaustive-search FLAG is turned on, always run exhaustive search.
// 2. Once auto-tune is turned on, run heuristic search (the default) before the
//    auto-tune process and run exhaustive search during that process.
// 3. After the auto-tune process, run the cached algorithm if one is cached;
//    otherwise run the default mode.
size_t key = args.GetCacheKey<T>();
auto& cache =
phi::autotune::AutoTuneCache::Instance().GetConvBackwardData();
if (cache.Find(key)) {
result.algo = static_cast<AlgoT>(cache.Get(key));
} else {
bool use_autotune =
phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
if (exhaustive_search || use_autotune) {
result = FindAlgoExhaustiveSearch<T>(args, ctx);
cache.Set(key, static_cast<int64_t>(result.algo));
} else {
result = FindAlgoHeuristic(args, ctx);
}
}
}
VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
<< ", deterministic=" << deterministic
<< ", choose algo=" << result.algo << ", workspace="
<< ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB";
return result;
}
static size_t GetWorkspaceSize(const ConvArgs& args,
cudnnConvolutionBwdDataAlgo_t algo) {
size_t workspace_size = 0;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
args.handle, args.wdesc.desc(), args.odesc.desc(),
args.cdesc.desc(), args.idesc.desc(), algo, &workspace_size));
return workspace_size;
}
private:
static SearchResult<AlgoT> FindAlgoDeterministic() {
return SearchResult<AlgoT>(CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
}
static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
const phi::GPUContext& ctx) {
SearchResult<AlgoT> result;
size_t workspace_size_limit =
CalcWorkspaceLimitInBytes(UseFixedWorkspace());
#if CUDNN_VERSION >= 7001 #if CUDNN_VERSION >= 7001
int perf_count; int actual_perf_count;
int best_algo_idx = 0; int best_algo_idx = 0;
std::unique_ptr<perf_t[]> perf_results( std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
new perf_t[kNUM_CUDNN_BWD_DATA_ALGS]); PADDLE_ENFORCE_GPU_SUCCESS(
PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( args.handle, args.wdesc.desc(), args.odesc.desc(),
args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS,
args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS, &actual_perf_count, perf_results.data()));
&perf_count, perf_results.get())); result.algo = perf_results[best_algo_idx].algo;
algo = (perf_results.get())[best_algo_idx].algo;
#if CUDNN_VERSION < 7500 #if CUDNN_VERSION < 7500
int stride_dim = args.x->dims().size() - 2; int stride_dim = args.x->dims().size() - 2;
bool blacklist = std::any_of(args.s.begin(), args.s.begin() + stride_dim, bool blacklist = std::any_of(args.s.begin(), args.s.begin() + stride_dim,
[=](int n) { return n != 1; }); [=](int n) { return n != 1; });
if (blacklist && (static_cast<cudnnConvolutionBwdDataAlgo_t>( if (blacklist && (perf_results[best_algo_idx].algo ==
perf_results[best_algo_idx].algo) == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING ||
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || perf_results[best_algo_idx].algo ==
static_cast<cudnnConvolutionBwdDataAlgo_t>( CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) {
perf_results[best_algo_idx].algo) == result.algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) { }
algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
#endif #endif
workspace_size = GetWorkspaceSize(args, algo); result.workspace_size = GetWorkspaceSize(args, result.algo);
if (workspace_size > workspace_size_limit) { if (result.workspace_size > workspace_size_limit) {
has_got_workspace_size = false;
#if CUDNN_VERSION >= 8000 #if CUDNN_VERSION >= 8000
// cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8 // cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8
ChooseAlgoByWorkspace<perf_t, algo_t>(perf_results.get(), ChooseAlgoByWorkspace<PerfT, AlgoT>(perf_results, workspace_size_limit,
kNUM_CUDNN_BWD_DATA_ALGS, &result);
workspace_size_limit, &algo);
#else
VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<< workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
args.handle, args.wdesc.desc(), args.odesc.desc(),
args.cdesc.desc(), args.idesc.desc(),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
#endif
}
#else #else
VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<< result.workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
args.handle, args.wdesc.desc(), args.odesc.desc(), args.handle, args.wdesc.desc(), args.odesc.desc(),
args.cdesc.desc(), args.idesc.desc(), args.cdesc.desc(), args.idesc.desc(),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo)); workspace_size_limit, &(result.algo)));
#endif #endif
} else if (deterministic) {
return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
} else {
auto& dev_ctx = ctx;
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
AlgorithmsCache<algo_t>& algo_cache =
*(framework::ConvSearchCache::Instance().GetBackwardData());
auto x_dims = phi::vectorize(args.x->dims());
auto w_dims = phi::vectorize(args.w->dims());
VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t"
<< ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s"
<< args.s << ", args.p" << args.p << ", args.d" << args.d;
algo = algo_cache.GetAlgorithm(
x_dims, w_dims, args.s, args.p, args.d, 0,
static_cast<int64_t>(args.cudnn_dtype), [&]() {
int returned_algo_count;
std::array<perf_t, kNUM_CUDNN_BWD_DATA_ALGS> perf_stat;
auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::
cudnnFindConvolutionBackwardDataAlgorithmEx(
args.handle, args.wdesc.desc(), args.w->data<T>(),
args.odesc.desc(), args.o->data<T>(),
args.cdesc.desc(), args.idesc.desc(),
const_cast<T*>(args.x->data<T>()),
kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
perf_stat.data(), cudnn_workspace_ptr,
workspace_size_limit));
};
workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit);
VLOG(3) << "BwdDataAlgo Perf result: (algo: stat, time, memory)";
for (int i = 0; i < returned_algo_count; ++i) {
const auto& stat = perf_stat[i];
VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
<< " " << stat.memory;
}
return perf_stat[0].algo;
});
} }
VLOG(3) << "choose algo " << algo; #else
return algo;
}
static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) {
size_t workspace_size = 0;
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
args.handle, args.wdesc.desc(), args.odesc.desc(), args.handle, args.wdesc.desc(), args.odesc.desc(),
args.cdesc.desc(), args.idesc.desc(), algo, &workspace_size)); args.cdesc.desc(), args.idesc.desc(),
return workspace_size; CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &(result.algo)));
#endif
return result;
}
template <typename T>
static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
const ConvArgs& args, const phi::GPUContext& ctx) {
SearchResult<AlgoT> result;
size_t workspace_size_limit =
CalcWorkspaceLimitInBytes(UseFixedWorkspace());
size_t max_workspace_size = GetMaxWorkspaceSize(args, workspace_size_limit);
VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
<< " MB";
int returned_algo_count;
std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_DATA_ALGS);
auto cudnn_find_func = [&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnFindConvolutionBackwardDataAlgorithmEx(
args.handle, args.wdesc.desc(), args.w->data<T>(),
args.odesc.desc(), args.o->data<T>(), args.cdesc.desc(),
args.idesc.desc(), const_cast<T*>(args.x->data<T>()),
kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
perf_results.data(), workspace_ptr, max_workspace_size));
};
auto workspace_handle = ctx.cudnn_workspace_handle();
workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size,
UseFixedWorkspace());
VLOG(4) << GetPerfResultString<PerfT>(
"[Exhaustive Search] BwdDataAlgo Perf result", perf_results,
returned_algo_count, workspace_size_limit);
ChooseAlgoByWorkspace<PerfT, AlgoT>(perf_results, workspace_size_limit,
&result);
return result;
}
static size_t GetMaxWorkspaceSize(const ConvArgs& args,
size_t workspace_size_limit) {
if (!UseFixedWorkspace()) {
size_t max_workspace_size = 0;
for (size_t algo = 0; algo < kNUM_CUDNN_BWD_DATA_ALGS; ++algo) {
size_t workspace_size = 0;
auto status =
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
args.handle, args.wdesc.desc(), args.odesc.desc(),
args.cdesc.desc(), args.idesc.desc(),
static_cast<cudnnConvolutionBwdDataAlgo_t>(algo),
&workspace_size);
if (status == CUDNN_STATUS_SUCCESS &&
workspace_size <= workspace_size_limit) {
max_workspace_size = std::max(workspace_size, max_workspace_size);
}
}
return max_workspace_size;
} else {
return workspace_size_limit;
}
} }
}; };
// cuDNN convolution backward-filter algorithm searcher, consisting of three
// algorithm search modes: deterministic, heuristic, and exhaustive_search.
// It also provides a workspace size acquisition function for the chosen
// algorithm.
template <> template <>
struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> { struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t;
using algo_t = cudnnConvolutionBwdFilterAlgo_t; using AlgoT = cudnnConvolutionBwdFilterAlgo_t;
template <typename T> template <typename T>
static algo_t Find(const ConvArgs& args, bool exhaustive_search, static SearchResult<AlgoT> Find(const ConvArgs& args, bool exhaustive_search,
bool deterministic, const phi::GPUContext& ctx) { bool deterministic,
const phi::GPUContext& ctx) {
platform::CUDAGraphCaptureModeGuard guard; platform::CUDAGraphCaptureModeGuard guard;
SearchResult<AlgoT> result;
auto dtype = platform::CudnnDataType<T>::type; auto dtype = platform::CudnnDataType<T>::type;
size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024;
size_t workspace_size = 0;
bool has_got_workspace_size = true;
SetConvMathType(ctx, dtype, args.cdesc); SetConvMathType(ctx, dtype, args.cdesc);
algo_t algo; if (deterministic) {
if (!exhaustive_search && !deterministic) { result = FindAlgoDeterministic();
} else {
// 1. Once the exhaustive-search FLAG is turned on, always run exhaustive search.
// 2. Once auto-tune is turned on, run heuristic search (the default) before the
//    auto-tune process and run exhaustive search during that process.
// 3. After the auto-tune process, run the cached algorithm if one is cached;
//    otherwise run the default mode.
size_t key = args.GetCacheKey<T>();
auto& cache =
phi::autotune::AutoTuneCache::Instance().GetConvBackwardFilter();
if (cache.Find(key)) {
result.algo = static_cast<AlgoT>(cache.Get(key));
} else {
bool use_autotune =
phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
if (exhaustive_search || use_autotune) {
result = FindAlgoExhaustiveSearch<T>(args, ctx);
cache.Set(key, static_cast<int64_t>(result.algo));
} else {
result = FindAlgoHeuristic(args, ctx);
}
}
}
VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
<< ", deterministic=" << deterministic
<< ", choose algo=" << result.algo << ", workspace="
<< ToMegaBytes(GetWorkspaceSize(args, result.algo)) << " MB";
return result;
}
static size_t GetWorkspaceSize(const ConvArgs& args,
cudnnConvolutionBwdFilterAlgo_t algo) {
platform::CUDAGraphCaptureModeGuard guard;
size_t workspace_size = 0;
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
args.handle, args.idesc.desc(), args.odesc.desc(),
args.cdesc.desc(), args.wdesc.desc(), algo, &workspace_size));
return workspace_size;
}
private:
static SearchResult<AlgoT> FindAlgoDeterministic() {
return SearchResult<AlgoT>(CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
}
static SearchResult<AlgoT> FindAlgoHeuristic(const ConvArgs& args,
const phi::GPUContext& ctx) {
SearchResult<AlgoT> result;
size_t workspace_size_limit =
CalcWorkspaceLimitInBytes(UseFixedWorkspace());
#if CUDNN_VERSION >= 7001 #if CUDNN_VERSION >= 7001
using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; int actual_perf_count;
int perf_count; int best_algo_idx = 0;
int best_algo_idx = 0; std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
std::unique_ptr<perf_t[]> perf_results( PADDLE_ENFORCE_GPU_SUCCESS(
new perf_t[kNUM_CUDNN_BWD_FILTER_ALGS]); platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7(
PADDLE_ENFORCE_GPU_SUCCESS( args.handle, args.idesc.desc(), args.odesc.desc(),
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS,
args.handle, args.idesc.desc(), args.odesc.desc(), &actual_perf_count, perf_results.data()));
args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS, result.algo = perf_results[best_algo_idx].algo;
&perf_count, perf_results.get())); result.workspace_size = perf_results[best_algo_idx].memory;
algo = (perf_results.get())[best_algo_idx].algo;
workspace_size = (perf_results.get())[best_algo_idx].memory;
if (workspace_size > workspace_size_limit) { if (result.workspace_size > workspace_size_limit) {
workspace_size = workspace_size_limit;
#if CUDNN_VERSION >= 8000 #if CUDNN_VERSION >= 8000
// cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8 // cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8
ChooseAlgoByWorkspace<perf_t, algo_t>(perf_results.get(), ChooseAlgoByWorkspace<PerfT, AlgoT>(perf_results, workspace_size_limit,
kNUM_CUDNN_BWD_FILTER_ALGS, &result);
workspace_size_limit, &algo);
#else
VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<< workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
args.handle, args.idesc.desc(), args.odesc.desc(),
args.cdesc.desc(), args.wdesc.desc(),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
#endif
}
#else #else
VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<< result.workspace_size << ") exceeds the limit("
<< workspace_size_limit << ")";
PADDLE_ENFORCE_GPU_SUCCESS( PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
args.handle, args.idesc.desc(), args.odesc.desc(), args.handle, args.idesc.desc(), args.odesc.desc(),
args.cdesc.desc(), args.wdesc.desc(), args.cdesc.desc(), args.wdesc.desc(),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo)); workspace_size_limit, &(result.algo)));
#endif #endif
} else if (deterministic) { }
return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; #else
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
args.handle, args.idesc.desc(), args.odesc.desc(),
args.cdesc.desc(), args.wdesc.desc(),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &(result.algo)));
#endif
return result;
}
template <typename T>
static SearchResult<AlgoT> FindAlgoExhaustiveSearch(
const ConvArgs& args, const phi::GPUContext& ctx) {
SearchResult<AlgoT> result;
int returned_algo_count = 0;
std::vector<PerfT> perf_results(kNUM_CUDNN_BWD_FILTER_ALGS);
size_t workspace_size_limit =
CalcWorkspaceLimitInBytes(UseFixedWorkspace());
auto workspace_handle = ctx.cudnn_workspace_handle();
if (platform::CudnnDataType<T>::type != CUDNN_DATA_HALF) {
size_t max_workspace_size =
GetMaxWorkspaceSize(args, workspace_size_limit);
VLOG(3) << "max_workspace_size=" << ToMegaBytes(max_workspace_size)
<< " MB";
auto cudnn_find_func = [&](void* workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithmEx(
args.handle, args.idesc.desc(), args.x->data<T>(),
args.odesc.desc(), args.o->data<T>(), args.cdesc.desc(),
args.wdesc.desc(), const_cast<T*>(args.w->data<T>()),
kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count,
perf_results.data(), workspace_ptr, max_workspace_size));
};
workspace_handle.RunFuncSync(cudnn_find_func, max_workspace_size,
UseFixedWorkspace());
VLOG(4) << GetPerfResultString<PerfT>(
"[Exhaustive Search] BwdFilterAlgo Perf result", perf_results,
returned_algo_count, workspace_size_limit);
ChooseAlgoByWorkspace<PerfT, AlgoT>(perf_results, workspace_size_limit,
&result);
} else { } else {
auto& dev_ctx = ctx; int max_algos = GetAlgorithmMaxCount(args.handle);
auto workspace_handle = dev_ctx.cudnn_workspace_handle(); std::vector<PerfT> perf_results(max_algos);
AlgorithmsCache<algo_t>& algo_cache = PADDLE_ENFORCE_GPU_SUCCESS(
*(framework::ConvSearchCache::Instance().GetBackwardFilter()); platform::dynload::cudnnFindConvolutionBackwardFilterAlgorithm(
args.handle, args.idesc.desc(), args.odesc.desc(),
auto x_dims = phi::vectorize(args.x->dims()); args.cdesc.desc(), args.wdesc.desc(), perf_results.size(),
auto w_dims = phi::vectorize(args.w->dims()); &returned_algo_count, perf_results.data()));
perf_results.resize(returned_algo_count);
VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t:"
<< ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" VLOG(4) << GetPerfResultString<PerfT>(
<< args.s << ", args.p" << args.p << ", args.d" << args.d; "[Exhaustive Search] BwdFilterAlgo Perf result", perf_results,
if (dtype != CUDNN_DATA_HALF) { perf_results.size(), workspace_size_limit);
algo = algo_cache.GetAlgorithm( ChooseAlgo(perf_results, workspace_size_limit, &result);
x_dims, w_dims, args.s, args.p, args.d, 0, }
static_cast<int64_t>(args.cudnn_dtype), [&]() {
int returned_algo_count; return result;
std::array<perf_t, kNUM_CUDNN_BWD_FILTER_ALGS> perf_stat; }
auto cudnn_find_func = [&](void* cudnn_workspace_ptr) {
PADDLE_ENFORCE_GPU_SUCCESS( static int GetAlgorithmMaxCount(cudnnHandle_t handle) {
platform::dynload:: #if CUDNN_VERSION_MIN(7, 0, 1)
cudnnFindConvolutionBackwardFilterAlgorithmEx( int max_algos = 0;
args.handle, args.idesc.desc(), args.x->data<T>(), auto status =
args.odesc.desc(), args.o->data<T>(), platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(
args.cdesc.desc(), args.wdesc.desc(), handle, &max_algos);
const_cast<T*>(args.w->data<T>()), if (status == gpuSuccess) {
kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, VLOG(5) << "[BackwardFilter] max_algos: predefined="
perf_stat.data(), cudnn_workspace_ptr, << kNUM_CUDNN_BWD_FILTER_ALGS << ", actual=" << max_algos;
workspace_size_limit)); return max_algos;
}; }
workspace_handle.RunFuncSync(cudnn_find_func, #endif
workspace_size_limit); return kNUM_CUDNN_BWD_FILTER_ALGS;
}
VLOG(3)
<< "BwdFilterAlgo Perf result: (algo: stat, time, memory)"; static size_t GetMaxWorkspaceSize(const ConvArgs& args,
for (int i = 0; i < returned_algo_count; ++i) { size_t workspace_size_limit) {
const auto& stat = perf_stat[i]; if (!UseFixedWorkspace()) {
VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time size_t max_workspace_size = 0;
<< " " << stat.memory; for (size_t algo = 0; algo < kNUM_CUDNN_BWD_FILTER_ALGS; ++algo) {
} size_t workspace_size = 0;
return perf_stat[0].algo; auto status =
}); platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
} else { args.handle, args.idesc.desc(), args.odesc.desc(),
auto max_algos = MaxBwdFilterAlgos(args.handle); args.cdesc.desc(), args.wdesc.desc(),
algo = algo_cache.GetAlgorithm( static_cast<cudnnConvolutionBwdFilterAlgo_t>(algo),
x_dims, w_dims, args.s, args.p, args.d, 0, &workspace_size);
static_cast<int64_t>(args.cudnn_dtype), [&]() { if (status == CUDNN_STATUS_SUCCESS &&
algo_t chosen_algo; workspace_size <= workspace_size_limit) {
std::vector<perf_t> perf_results(max_algos); max_workspace_size = std::max(workspace_size, max_workspace_size);
int actual_algos = 0; }
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::
cudnnFindConvolutionBackwardFilterAlgorithm(
args.handle, args.idesc.desc(), args.odesc.desc(),
args.cdesc.desc(), args.wdesc.desc(),
perf_results.size(), &actual_algos,
perf_results.data()));
perf_results.resize(actual_algos);
ChooseAlgo<perf_t, algo_t>(perf_results, workspace_size_limit,
&chosen_algo);
return chosen_algo;
});
} }
return max_workspace_size;
} else {
return workspace_size_limit;
} }
VLOG(3) << "choose algo " << algo;
return algo;
} }
static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { static void ChooseAlgo(const std::vector<PerfT>& perf_results,
platform::CUDAGraphCaptureModeGuard guard; size_t workspace_limit,
size_t workspace_size = 0; SearchResult<AlgoT>* algo_result) {
PADDLE_ENFORCE_GPU_SUCCESS( for (size_t i = 0; i != perf_results.size(); ++i) {
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( const auto& result = perf_results[i];
args.handle, args.idesc.desc(), args.odesc.desc(), if (result.status == CUDNN_STATUS_SUCCESS &&
args.cdesc.desc(), args.wdesc.desc(), algo, &workspace_size)); (result.memory <= workspace_limit)) {
return workspace_size; if ((result.mathType == CUDNN_TENSOR_OP_MATH) &&
(i != perf_results.size() - 1)) {
const auto& next_result = perf_results[i + 1];
if (next_result.status == CUDNN_STATUS_SUCCESS &&
next_result.algo == result.algo &&
next_result.memory == result.memory &&
next_result.mathType != CUDNN_TENSOR_OP_MATH &&
next_result.time < 1.01 * result.time) {
// Skip over this result- it's not really a Tensor Core algo.
// Because it is only 1% performance difference.
// Prefer to choose the next equivalent non-Tensor Core algo.
continue;
}
}
algo_result->algo = result.algo;
algo_result->time = result.time;
auto math_type_str = "0";
if (result.mathType == CUDNN_TENSOR_OP_MATH) {
math_type_str = "1";
}
VLOG(3) << " choose algo: " << result.algo
<< ", TC: " << math_type_str << ", time: " << result.time
<< " ms, wksp = " << result.memory
<< ", status = " << result.status;
break;
}
}
} }
}; };
......
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"

-DECLARE_uint64(conv_workspace_size_limit);
+DECLARE_int64(conv_workspace_size_limit);
 DECLARE_bool(cudnn_exhaustive_search);
 DECLARE_int64(cudnn_exhaustive_search_times);
...
@@ -14,42 +14,12 @@ limitations under the License. */

 #pragma once

-#include <algorithm>
-#include <array>
-#include <memory>
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/conv_search_cache.h"
-#include "paddle/fluid/framework/operator_kernel_configs.h"
-#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
-#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
-#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/fluid/operators/conv_base_helper.h"

 namespace paddle {
 namespace operators {

-using Tensor = framework::Tensor;
-using DataLayout = platform::DataLayout;
-template <typename T>
-using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
-using framework::AlgorithmsCache;
-
-static inline void GetNCDHW(const framework::DDim& dims,
-                            const DataLayout& layout, int* N, int* C, int* D,
-                            int* H, int* W) {
-  *N = dims[0];
-  *C = layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
-  int i = layout == DataLayout::kNCHW ? 0 : 1;
-  if (dims.size() == 5) {
-    *D = dims[2 - i];
-    *H = dims[3 - i];
-    *W = dims[4 - i];
-  } else {
-    *D = 1;
-    *H = dims[2 - i];
-    *W = dims[3 - i];
-  }
-}
+using ConvArgs = ConvArgsBase<miopenHandle_t, miopenDataType_t>;

 template <typename DeviceContext, typename T, size_t D>
 static void RemovePaddingSlice(const phi::GPUContext& context,
@@ -66,9 +36,8 @@ static void RemovePaddingSlice(const phi::GPUContext& context,
     extents[i] = new_out_dims[i];
   }

-  int start;
   for (size_t i = 0; i < axes.size(); ++i) {
-    start = starts[i];
+    int start = starts[i];
     if (start < 0) {
       start = (start + in_dims[axes[i]]);
     }
@@ -85,41 +54,6 @@ static void RemovePaddingSlice(const phi::GPUContext& context,
   out_t.device(place) = in_t.slice(offsets, extents);
 }

-template <typename T>
-std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
-  out << "[";
-  for (auto const& tmp : v) out << tmp << ",";
-  out << "]";
-  return out;
-}
-
-using framework::ConvSearchCache;
-
-struct ConvArgs {
-  miopenHandle_t handle;
-  platform::TensorDescriptor idesc, odesc;
-  platform::FilterDescriptor wdesc;
-  platform::ConvolutionDescriptor cdesc;
-  const framework::Tensor *x, *w, *o;
-  miopenDataType_t cudnn_dtype;
-  // strides
-  std::vector<int> s;
-  // paddings
-  std::vector<int> p;
-  // dilations
-  std::vector<int> d;
-
-  ConvArgs(const framework::Tensor* x, const framework::Tensor* w,
-           const framework::Tensor* o, const std::vector<int> s,
-           const std::vector<int> p, const std::vector<int> d,
-           miopenDataType_t dtype)
-      : x(x), w(w), o(o), s(s), p(p), d(d), cudnn_dtype(dtype) {}
-};
-
-template <typename algo_t>
-struct SearchAlgorithm {};
-
 template <>
 struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
   using perf_t = miopenConvAlgoPerf_t;
...
@@ -16,8 +16,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"

-DECLARE_uint64(conv_workspace_size_limit);
-
 namespace paddle {
 namespace operators {
...
@@ -188,6 +188,8 @@ class RecordedGpuMallocHelper {
     if (UNLIKELY(malloc_managed_memory)) {
       result = cudaMallocManaged(ptr, size);
     } else {
+      VLOG(10) << "[cudaMalloc] size=" << static_cast<double>(size) / (1 << 20)
+               << " MB";
       result = cudaMalloc(ptr, size);
     }
 #endif
@@ -226,6 +228,8 @@ class RecordedGpuMallocHelper {
     if (err != hipErrorDeinitialized) {
 #else
     auto err = cudaFree(ptr);
+    VLOG(10) << "[cudaFree] size=" << static_cast<double>(size) / (1 << 20)
+             << " MB";
     if (err != cudaErrorCudartUnloading) {
 #endif
       PADDLE_ENFORCE_GPU_SUCCESS(err);
...
@@ -522,8 +522,8 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) {
   cuda_stream_.reset(new stream::CUDAStream(phi::GPUContext::stream(), place));
   auto& instance = memory::allocation::AllocatorFacade::Instance();
   instance.SetDefaultStream(place, phi::GPUContext::stream());
-  workspace_.reset(
-      new phi::DnnWorkspaceHandle(instance.GetAllocator(place).get()));
+  workspace_.reset(new phi::DnnWorkspaceHandle(
+      instance.GetAllocator(place).get(), stream()));
 }

 CUDADeviceContext::~CUDADeviceContext() = default;
@@ -623,7 +623,8 @@ phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
     return phi::DnnWorkspaceHandle(
         memory::allocation::AllocatorFacade::Instance()
             .GetAllocator(GetPlace())
-            .get());
+            .get(),
+        stream());
   }
   return phi::GPUContext::cudnn_workspace_handle();
 }
...
...@@ -158,10 +158,9 @@ PADDLE_DEFINE_EXPORTED_bool( ...@@ -158,10 +158,9 @@ PADDLE_DEFINE_EXPORTED_bool(
* increased. * increased.
* Users need to balance memory and speed. * Users need to balance memory and speed.
*/ */
PADDLE_DEFINE_EXPORTED_uint64( PADDLE_DEFINE_EXPORTED_int64(conv_workspace_size_limit,
conv_workspace_size_limit, paddle::platform::kDefaultConvWorkspaceSizeLimitMB,
paddle::platform::kDefaultConvWorkspaceSizeLimitMB, "cuDNN convolution workspace limit in MB unit.");
"cuDNN convolution workspace limit in MB unit.");
/** /**
* CUDNN related FLAG * CUDNN related FLAG
...@@ -800,3 +799,12 @@ DEFINE_bool(enable_ins_parser_file, false, ...@@ -800,3 +799,12 @@ DEFINE_bool(enable_ins_parser_file, false,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait");
#endif #endif
/**
* Autotune related FLAG
* Name: FLAGS_use_autotune
* Since Version: 2.3.0
* Value Range: bool, default=false
* Example:
*/
PADDLE_DEFINE_EXPORTED_bool(use_autotune, false, "Whether to enable autotune.");
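On the C++ side the flag is consumed through DECLARE_bool, as switch_autotune.cc does further down. A hedged sketch of a kernel-side gate that combines the flag with the per-step decision (the helper function itself is hypothetical):

// Hypothetical helper: only pay for an exhaustive search while the
// autotune window is active and the flag is on.
DECLARE_bool(use_autotune);

static bool ShouldSearchExhaustively() {
  return FLAGS_use_autotune &&
         phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
}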
...@@ -4430,7 +4430,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -4430,7 +4430,7 @@ All parameter, weight, gradient are variables in Paddle.
return phi::autotune::AutoTuneStatus::Instance().DisableAutoTune(); return phi::autotune::AutoTuneStatus::Instance().DisableAutoTune();
}); });
m.def("autotune_range", [](int64_t start, int64_t stop) { m.def("set_autotune_range", [](int64_t start, int64_t stop) {
return phi::autotune::AutoTuneStatus::Instance().SetAutoTuneRange(start, return phi::autotune::AutoTuneStatus::Instance().SetAutoTuneRange(start,
stop); stop);
}); });
...@@ -4439,10 +4439,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -4439,10 +4439,8 @@ All parameter, weight, gradient are variables in Paddle.
[] { return phi::autotune::AutoTuneStatus::Instance().Update(); }); [] { return phi::autotune::AutoTuneStatus::Instance().Update(); });
m.def("autotune_status", [] { m.def("autotune_status", [] {
phi::autotune::AutoTuneCache::Instance().UpdateStatus();
py::dict res; py::dict res;
res["use_autotune"] = phi::autotune::AutoTuneCache::Instance().UpdateStatus();
phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
res["step_id"] = phi::autotune::AutoTuneStatus::Instance().StepID(); res["step_id"] = phi::autotune::AutoTuneStatus::Instance().StepID();
res["cache_size"] = phi::autotune::AutoTuneCache::Instance().Size(); res["cache_size"] = phi::autotune::AutoTuneCache::Instance().Size();
res["cache_hit_rate"] = res["cache_hit_rate"] =
......
...@@ -12,6 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -12,6 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include <algorithm> #include <algorithm>
#include <array> #include <array>
...@@ -155,6 +156,39 @@ static void StreamCallbackFunc(gpuStream_t stream, ...@@ -155,6 +156,39 @@ static void StreamCallbackFunc(gpuStream_t stream,
} // namespace internal } // namespace internal
void DnnWorkspaceHandle::RunFuncSync(
const std::function<void(void*)>& cudnn_func,
size_t required_workspace_bytes,
bool use_cached_allocation) {
bool need_realloc = required_workspace_bytes > WorkspaceSize();
if (need_realloc && !use_cached_allocation) {
void* workspace_ptr = nullptr;
size_t size = ((required_workspace_bytes + 255) >> 8) << 8;
std::lock_guard<std::mutex> guard(*mtx_);
#ifdef PADDLE_WITH_HIP
auto status = hipMalloc(&workspace_ptr, size);
#else
auto status = cudaMalloc(&workspace_ptr, size);
#endif
if (status == gpuSuccess) {
cudnn_func(workspace_ptr);
phi::backends::gpu::GpuStreamSync(stream_);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipFree(workspace_ptr));
#else
PADDLE_ENFORCE_GPU_SUCCESS(cudaFree(workspace_ptr));
#endif
return;
}
}
RunFunc(cudnn_func, required_workspace_bytes);
if (need_realloc) {
// Release the workspace allocated in this run.
ResetWorkspace();
}
}
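A usage sketch for the new RunFuncSync overload, with illustrative names: when use_cached_allocation is false and the cached workspace is too small, the handle takes a raw cudaMalloc/hipMalloc, runs the functor, synchronizes the stream and frees the temporary buffer; otherwise it falls back to RunFunc and drops the enlarged cached allocation afterwards.

// Illustrative only: run a search-phase cuDNN call that needs
// `workspace_bytes` of scratch space without touching the cached allocation.
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
workspace_handle.RunFuncSync(
    [&](void* workspace_ptr) {
      // e.g. a cudnnFind*AlgorithmEx call using workspace_ptr/workspace_bytes
    },
    workspace_bytes,
    /*use_cached_allocation=*/false);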
void DnnWorkspaceHandle::ResetWorkspace() { allocation_ = nullptr; } void DnnWorkspaceHandle::ResetWorkspace() { allocation_ = nullptr; }
void DnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) { void DnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
...@@ -295,13 +329,13 @@ struct GPUContext::Impl { ...@@ -295,13 +329,13 @@ struct GPUContext::Impl {
void InitDnnWorkspace() { void InitDnnWorkspace() {
PD_CHECK(allocator_ != nullptr, PD_CHECK(allocator_ != nullptr,
"the device allocator for gpu context is nullptr."); "the device allocator for gpu context is nullptr.");
workspace_ = new DnnWorkspaceHandle(allocator_); workspace_ = new DnnWorkspaceHandle(allocator_, stream_);
} }
void DestoryInternalWorkspace() { void DestoryInternalWorkspace() {
if (owned_ && workspace_ != nullptr) { if (owned_ && workspace_ != nullptr) {
delete workspace_; delete workspace_;
stream_ = nullptr; workspace_ = nullptr;
} }
} }
...@@ -313,7 +347,7 @@ struct GPUContext::Impl { ...@@ -313,7 +347,7 @@ struct GPUContext::Impl {
DnnWorkspaceHandle GetDnnWorkspace() { DnnWorkspaceHandle GetDnnWorkspace() {
PD_CHECK(allocator_ != nullptr, PD_CHECK(allocator_ != nullptr,
"the device allocator for gpu context is nullptr."); "the device allocator for gpu context is nullptr.");
return DnnWorkspaceHandle(allocator_); return DnnWorkspaceHandle(allocator_, stream_);
} }
void InitStream() { void InitStream() {
......
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/forwards.h"
#include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_decls.h"
#include "paddle/phi/backends/gpu/gpu_helper.h" #include "paddle/phi/backends/gpu/gpu_helper.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/common/place.h" #include "paddle/phi/common/place.h"
#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/device_context.h"
...@@ -28,8 +29,8 @@ namespace phi { ...@@ -28,8 +29,8 @@ namespace phi {
class DnnWorkspaceHandle { class DnnWorkspaceHandle {
public: public:
explicit inline DnnWorkspaceHandle(Allocator* allocator) inline DnnWorkspaceHandle(Allocator* allocator, gpuStream_t stream)
: allocator_(allocator) { : allocator_(allocator), stream_(stream) {
mtx_.reset(new std::mutex()); mtx_.reset(new std::mutex());
} }
...@@ -48,11 +49,9 @@ class DnnWorkspaceHandle { ...@@ -48,11 +49,9 @@ class DnnWorkspaceHandle {
* running the function. Currently this function is only used when cudnn * running the function. Currently this function is only used when cudnn
* exhaustive searching and callers have to guarantee that the input function * exhaustive searching and callers have to guarantee that the input function
* is host blocking */ * is host blocking */
inline void RunFuncSync(const std::function<void(void*)>& cudnn_func, void RunFuncSync(const std::function<void(void*)>& cudnn_func,
size_t required_workspace_bytes) { size_t required_workspace_bytes,
RunFunc(cudnn_func, required_workspace_bytes); bool use_cached_allocation = true);
ResetWorkspace();
}
inline size_t WorkspaceSize() { inline size_t WorkspaceSize() {
if (allocation_ == nullptr) { if (allocation_ == nullptr) {
...@@ -70,7 +69,8 @@ class DnnWorkspaceHandle { ...@@ -70,7 +69,8 @@ class DnnWorkspaceHandle {
private: private:
Allocator::AllocationPtr allocation_{nullptr}; Allocator::AllocationPtr allocation_{nullptr};
Allocator* allocator_{nullptr}; Allocator* allocator_{nullptr}; // Not owned
gpuStream_t stream_{nullptr}; // Not owned
std::unique_ptr<std::mutex> mtx_; std::unique_ptr<std::mutex> mtx_;
}; };
......
...@@ -6,12 +6,15 @@ file(APPEND ${kernel_declare_file} "#include \"paddle/phi/core/kernel_registry.h ...@@ -6,12 +6,15 @@ file(APPEND ${kernel_declare_file} "#include \"paddle/phi/core/kernel_registry.h
# phi functors and functions called by kernels # phi functors and functions called by kernels
add_subdirectory(funcs) add_subdirectory(funcs)
# kernel autotune
add_subdirectory(autotune)
# phi depends all phi kernel targets # phi depends all phi kernel targets
set_property(GLOBAL PROPERTY PHI_KERNELS "") set_property(GLOBAL PROPERTY PHI_KERNELS "")
# [ 1. Common kernel compilation dependencies ] # [ 1. Common kernel compilation dependencies ]
set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel) set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor selected_rows_functor ) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor selected_rows_functor)
# remove this dep after removing fluid deps on tensor creation # remove this dep after removing fluid deps on tensor creation
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils)
set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta)
...@@ -27,13 +30,17 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) ...@@ -27,13 +30,17 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel)
# Some kernels depend on some targets that are not commonly used. # Some kernels depend on some targets that are not commonly used.
# These targets are not suitable for common dependencies. # These targets are not suitable for common dependencies.
# In this case, you need to manually generate them here. # In this case, you need to manually generate them here.
set(MANUAL_BUILD_KERNELS cross_entropy_kernel deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel set(AUTOTUNE_KERNELS conv_kernel conv_grad_kernel conv_grad_grad_kernel conv_transpose_kernel conv_transpose_grad_kernel)
set(MANUAL_BUILD_KERNELS ${AUTOTUNE_KERNELS} cross_entropy_kernel deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel
gumbel_softmax_kernel gumbel_softmax_grad_kernel hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel
matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel
put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel
softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel
triangular_solve_grad_kernel determinant_grad_kernel reduce_sum_kernel rnn_kernel rnn_grad_kernel warpctc_kernel warpctc_grad_kernel) triangular_solve_grad_kernel determinant_grad_kernel reduce_sum_kernel rnn_kernel rnn_grad_kernel warpctc_kernel warpctc_grad_kernel)
foreach(src ${AUTOTUNE_KERNELS})
kernel_library(${src} DEPS ${COMMON_KERNEL_DEPS} switch_autotune)
endforeach()
kernel_library(cross_entropy_kernel DEPS ${COMMON_KERNEL_DEPS} softmax cross_entropy) kernel_library(cross_entropy_kernel DEPS ${COMMON_KERNEL_DEPS} softmax cross_entropy)
kernel_library(deformable_conv_kernel DEPS ${COMMON_KERNEL_DEPS} deformable_conv_functor) kernel_library(deformable_conv_kernel DEPS ${COMMON_KERNEL_DEPS} deformable_conv_functor)
kernel_library(deformable_conv_grad_kernel DEPS ${COMMON_KERNEL_DEPS} deformable_conv_functor) kernel_library(deformable_conv_grad_kernel DEPS ${COMMON_KERNEL_DEPS} deformable_conv_functor)
...@@ -74,6 +81,3 @@ add_subdirectory(selected_rows) ...@@ -74,6 +81,3 @@ add_subdirectory(selected_rows)
copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
# For strings kernels # For strings kernels
add_subdirectory(strings) add_subdirectory(strings)
# 5. kernel autotune
add_subdirectory(autotune)
if (WITH_GPU) if (WITH_GPU)
nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest)
nv_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) nv_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest)
elseif (WITH_ROCM) elseif (WITH_ROCM)
hip_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest) hip_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest)
hip_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest) hip_test(auto_tune_test SRCS auto_tune_test.cu DEPS gtest)
endif() endif()
cc_library(cache SRCS cache.cc DEPS boost) cc_library(cache SRCS cache.cc DEPS boost)
cc_library(switch_autotune SRCS switch_autotune.cc DEPS cache flags)
cc_test(cache_test SRCS cache_test.cc DEPS gtest cache) cc_test(cache_test SRCS cache_test.cc DEPS gtest cache)
...@@ -13,6 +13,8 @@ ...@@ -13,6 +13,8 @@
// limitations under the License. // limitations under the License.
#include "paddle/phi/kernels/autotune/cache.h" #include "paddle/phi/kernels/autotune/cache.h"
#include <iomanip>
#include "glog/logging.h"
namespace phi { namespace phi {
namespace autotune { namespace autotune {
...@@ -32,5 +34,40 @@ size_t ConvKey(const std::vector<int64_t>& x_dims, ...@@ -32,5 +34,40 @@ size_t ConvKey(const std::vector<int64_t>& x_dims,
static_cast<int64_t>(dtype)); static_cast<int64_t>(dtype));
} }
std::string AlgorithmTypeString(int64_t algo_type) {
if (algo_type == static_cast<int64_t>(AlgorithmType::kConvForward)) {
return "conv_forward";
} else if (algo_type ==
static_cast<int64_t>(AlgorithmType::kConvBackwardData)) {
return "conv_backward_data";
} else if (algo_type ==
static_cast<int64_t>(AlgorithmType::kConvBackwardFilter)) {
return "conv_backward_filter";
}
return std::to_string(algo_type);
}
void AutoTuneCache::UpdateStatus() {
int64_t size = 0;
int64_t cache_hits = 0;
int64_t cache_misses = 0;
int name_width = 24;
std::cout.setf(std::ios::left);
for (auto& v : auto_tune_map_) {
VLOG(4) << "AlgoType: " << std::setfill(' ') << std::setw(name_width)
<< AlgorithmTypeString(v.first)
<< " Cache Size: " << v.second.Size()
<< " Hits: " << v.second.CacheHits()
<< " Misses: " << v.second.CacheMisses()
<< " Hit Rate: " << v.second.CacheHitRate();
size += v.second.Size();
cache_hits += v.second.CacheHits();
cache_misses += v.second.CacheMisses();
}
total_size_ = size;
total_cache_hits_ = cache_hits;
total_cache_misses_ = cache_misses;
}
} // namespace autotune } // namespace autotune
} // namespace phi } // namespace phi
...@@ -13,11 +13,12 @@ ...@@ -13,11 +13,12 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <algorithm> #include <algorithm>
#include <mutex> #include <mutex>
#include <numeric>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "glog/logging.h"
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/errors.h" #include "paddle/phi/core/errors.h"
...@@ -92,6 +93,13 @@ class AlgorithmsCache { ...@@ -92,6 +93,13 @@ class AlgorithmsCache {
return ret; return ret;
} }
void Clean() {
std::lock_guard<std::mutex> lock(*cache_mutex_);
hash_.clear();
cache_hits_ = 0;
cache_misses_ = 0;
}
void Set(size_t key, AlgorithmT algo) { void Set(size_t key, AlgorithmT algo) {
std::lock_guard<std::mutex> lock(*cache_mutex_); std::lock_guard<std::mutex> lock(*cache_mutex_);
hash_[key] = algo; hash_[key] = algo;
...@@ -116,15 +124,22 @@ class AlgorithmsCache { ...@@ -116,15 +124,22 @@ class AlgorithmsCache {
private: private:
std::unordered_map<size_t, AlgorithmT> hash_; std::unordered_map<size_t, AlgorithmT> hash_;
std::shared_ptr<std::mutex> cache_mutex_; std::shared_ptr<std::mutex> cache_mutex_;
int64_t cache_hits_ = 0;
int64_t cache_misses_ = 0; int64_t cache_hits_{0};
int64_t cache_misses_{0};
};
enum class AlgorithmType {
kConvForward = 1,
kConvBackwardData = 2,
kConvBackwardFilter = 3,
kAlgorithmCount = 4
}; };
// AlgorithmsConfigKey -> AlgorithmsID // AlgorithmsConfigKey -> AlgorithmsID
using AlgorithmsConfigKeyMap = AlgorithmsCache<int64_t>; using AlgorithmsCacheMap = AlgorithmsCache<int64_t>;
// AlgorithmsType -> AlgorithmsCache // AlgorithmType -> AlgorithmsCache
using AlgorithmsTypeMap = using AlgorithmsTypeMap = std::unordered_map<int64_t, AlgorithmsCacheMap>;
std::unordered_map<std::string, AlgorithmsConfigKeyMap>;
class AutoTuneCache { class AutoTuneCache {
public: public:
...@@ -133,42 +148,30 @@ class AutoTuneCache { ...@@ -133,42 +148,30 @@ class AutoTuneCache {
return autotune_cache; return autotune_cache;
} }
AlgorithmsConfigKeyMap& RegisterOrGet(const std::string& algo_type) { AlgorithmsCacheMap& Get(const AlgorithmType& algo_type) {
std::lock_guard<std::mutex> lock(*autotune_cache_mutex_); return auto_tune_map_[static_cast<int64_t>(algo_type)];
if (auto_tune_map_.find(algo_type) == auto_tune_map_.end()) {
AlgorithmsConfigKeyMap cache;
auto_tune_map_[algo_type] = cache;
}
return auto_tune_map_[algo_type];
} }
void Clean(float miss_rate) { AlgorithmsCacheMap& GetConvForward() {
std::lock_guard<std::mutex> lock(*autotune_cache_mutex_); return Get(AlgorithmType::kConvForward);
// Set a small tolerance to avoid performance degradation }
// due to large cache size under dynamic shape.
if (miss_rate > 0.01) { AlgorithmsCacheMap& GetConvBackwardData() {
auto_tune_map_.clear(); return Get(AlgorithmType::kConvBackwardData);
} }
AlgorithmsCacheMap& GetConvBackwardFilter() {
return Get(AlgorithmType::kConvBackwardFilter);
} }
void UpdateStatus() { void Clean() {
int64_t size = 0;
int64_t cache_hits = 0;
int64_t cache_misses = 0;
for (auto& v : auto_tune_map_) { for (auto& v : auto_tune_map_) {
VLOG(4) << "AlgoType: " << v.first << " Cache Size: " << v.second.Size() v.second.Clean();
<< " Hits: " << v.second.CacheHits()
<< " Misses: " << v.second.CacheMisses()
<< " Hit Rate: " << v.second.CacheHitRate();
size += v.second.Size();
cache_hits += v.second.CacheHits();
cache_misses += v.second.CacheMisses();
} }
total_size_ = size;
total_cache_hits_ = cache_hits;
total_cache_misses_ = cache_misses;
} }
void UpdateStatus();
// The number of total config cached // The number of total config cached
int64_t Size() const { return total_size_; } int64_t Size() const { return total_size_; }
...@@ -183,17 +186,30 @@ class AutoTuneCache { ...@@ -183,17 +186,30 @@ class AutoTuneCache {
total_cache_hit_rate = static_cast<float>(total_cache_hits_) / total_cache_hit_rate = static_cast<float>(total_cache_hits_) /
static_cast<float>(total_num_accesses); static_cast<float>(total_num_accesses);
} }
return total_cache_hit_rate; return total_cache_hit_rate;
} }
private: private:
AutoTuneCache() : autotune_cache_mutex_(new std::mutex()) {} AutoTuneCache() : autotune_cache_mutex_(new std::mutex()) {
for (int i = 1; i < static_cast<int>(AlgorithmType::kAlgorithmCount); ++i) {
Register(static_cast<AlgorithmType>(i));
}
}
void Register(const AlgorithmType& algo_type) {
std::lock_guard<std::mutex> lock(*autotune_cache_mutex_);
int64_t key = static_cast<int64_t>(algo_type);
if (auto_tune_map_.find(key) == auto_tune_map_.end()) {
AlgorithmsCacheMap cache;
auto_tune_map_[key] = cache;
}
}
AlgorithmsTypeMap auto_tune_map_; AlgorithmsTypeMap auto_tune_map_;
std::shared_ptr<std::mutex> autotune_cache_mutex_; std::shared_ptr<std::mutex> autotune_cache_mutex_;
int64_t total_cache_hits_ = 0; int64_t total_cache_hits_{0};
int64_t total_cache_misses_ = 0; int64_t total_cache_misses_{0};
int64_t total_size_ = 0; int64_t total_size_{0};
}; };
} // namespace autotune } // namespace autotune
......
...@@ -22,7 +22,7 @@ enum ConvAlgos { GEMMKernel = 0, CuDNNKernel_1 = 1, CuDNNKernel_2 = 2 }; ...@@ -22,7 +22,7 @@ enum ConvAlgos { GEMMKernel = 0, CuDNNKernel_1 = 1, CuDNNKernel_2 = 2 };
TEST(AlgosCache, AlgosCache) { TEST(AlgosCache, AlgosCache) {
auto autotune_cache = phi::autotune::AutoTuneCache::Instance(); auto autotune_cache = phi::autotune::AutoTuneCache::Instance();
auto& cache = autotune_cache.RegisterOrGet("conv_fw"); auto& cache = autotune_cache.GetConvForward();
std::vector<int64_t> x_shape = {4, 224, 224, 3}; std::vector<int64_t> x_shape = {4, 224, 224, 3};
std::vector<int64_t> w_shape = {32, 3, 3, 3}; std::vector<int64_t> w_shape = {32, 3, 3, 3};
......
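With the string keys gone, kernels reach the per-algorithm caches through the typed getters. A hedged lookup sketch, assuming AlgorithmsCache keeps its existing Find/Get/Set interface; the dims, attributes and the search helper are placeholders:

// Illustrative only: probe the forward-conv cache before searching.
auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward();
size_t key = phi::autotune::ConvKey(
    x_dims, w_dims, strides, paddings, dilations, phi::DataType::FLOAT32);
int64_t algo_id = 0;
if (cache.Find(key)) {
  algo_id = cache.Get(key);
} else {
  algo_id = RunExhaustiveSearch();  // hypothetical helper
  cache.Set(key, algo_id);
}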
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DECLARE_bool(use_autotune);
namespace phi {
namespace autotune {
void AutoTuneStatus::EnableAutoTune() {
FLAGS_use_autotune = true;
Init();
}
void AutoTuneStatus::DisableAutoTune() {
FLAGS_use_autotune = false;
Init();
}
void AutoTuneStatus::Update() {
current_steps_id_ += 1;
if (!FLAGS_use_autotune) {
return;
}
// This function is called when each iteration finishes.
if (current_steps_id_ + 1 < start_step_id_) {
use_autotune_ = false;
} else if (current_steps_id_ + 1 >= start_step_id_ &&
current_steps_id_ + 1 < stop_step_id_) {
use_autotune_ = true;
AutoTuneCache::Instance().UpdateStatus();
step_hit_rates_.push_back(StepHitRate());
VLOG(3) << "Step ID: " << current_steps_id_
<< ", Accumulative Cache Hit Rate: "
<< static_cast<int>(AutoTuneCache::Instance().CacheHitRate() * 100)
<< "%, Cache Size: " << AutoTuneCache::Instance().Size()
<< ", Current Step Hit Rate: "
<< static_cast<int>(StepHitRate() * 100) << "%";
} else {
use_autotune_ = false;
// Set a small tolerance to avoid performance degradation
// due to large cache size under dynamic shape.
// TODO(limingshu): Currently this works for conv ops only; the
// method shall be optimized when more ops are involved.
// float miss_rate = static_cast<float>(1) - RecentHitRate();
// if (current_steps_id_ == stop_step_id_) {
// AutoTuneCache::Instance().Clean(miss_rate);
// }
if (VLOG_IS_ON(4)) {
AutoTuneCache::Instance().UpdateStatus();
VLOG(4) << "Step ID: " << current_steps_id_ << ", Current Step Hit Rate: "
<< static_cast<int>(StepHitRate() * 100) << "%";
}
}
}
} // namespace autotune
} // namespace phi
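Taken together, the intended lifecycle is: enable the flag, call Update() once per finished iteration, and let kernels ask UseAutoTune() whether they are inside the [start_step_id_, stop_step_id_) tuning window. A self-contained driver sketch; only the autotune calls are real APIs, TrainOneStep is a placeholder:

auto& status = phi::autotune::AutoTuneStatus::Instance();
status.EnableAutoTune();         // sets FLAGS_use_autotune and resets state
status.SetAutoTuneRange(1, 10);  // tune from step 1 up to (not including) 10

for (int64_t step = 0; step < num_steps; ++step) {
  TrainOneStep();                // placeholder for the real iteration
  status.Update();               // advances StepID() and refreshes UseAutoTune()
}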
...@@ -13,10 +13,8 @@ ...@@ -13,10 +13,8 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <cmath> #include <cmath>
#include <mutex>
#include <numeric>
#include "glog/logging.h"
#include "paddle/phi/kernels/autotune/cache.h" #include "paddle/phi/kernels/autotune/cache.h"
namespace phi { namespace phi {
...@@ -31,45 +29,11 @@ class AutoTuneStatus { ...@@ -31,45 +29,11 @@ class AutoTuneStatus {
bool UseAutoTune() { return use_autotune_; } bool UseAutoTune() { return use_autotune_; }
// EnableAutoTune and DisableAutoTune Should be used for debug only. // EnableAutoTune and DisableAutoTune should be used for debug only.
void EnableAutoTune() { void EnableAutoTune();
use_autotune_ = true; void DisableAutoTune();
Init();
}
void DisableAutoTune() {
use_autotune_ = false;
Init();
}
void Update() { void Update();
current_steps_id_ += 1;
if (!use_autotune_ && !update_use_autotune_) {
return;
}
if (current_steps_id_ < start_step_id_) {
use_autotune_ = false;
} else if (current_steps_id_ >= start_step_id_ &&
current_steps_id_ < stop_step_id_) {
use_autotune_ = true;
AutoTuneCache::Instance().UpdateStatus();
step_hit_rates_.push_back(StepHitRate());
VLOG(3) << "Step ID " << current_steps_id_
<< ", Accumulative Cache Hit Rate: "
<< AutoTuneCache::Instance().CacheHitRate()
<< ", Cache Size: " << AutoTuneCache::Instance().Size()
<< ", Current Step Hit Rate: " << StepHitRate();
} else if (current_steps_id_ == stop_step_id_) {
use_autotune_ = false;
update_use_autotune_ = false;
// clean cache according miss rate
float miss_rate = static_cast<float>(1) - RecentHitRate();
AutoTuneCache::Instance().Clean(miss_rate);
VLOG(3) << "Recent Miss Rate: " << miss_rate;
}
}
int64_t StepID() { return current_steps_id_; } int64_t StepID() { return current_steps_id_; }
...@@ -84,19 +48,25 @@ class AutoTuneStatus { ...@@ -84,19 +48,25 @@ class AutoTuneStatus {
// Hit Rate of Current Step // Hit Rate of Current Step
float StepHitRate() { float StepHitRate() {
int64_t current_hits = AutoTuneCache::Instance().CacheHits(); static int64_t last_step_id = -2;
int64_t current_misses = AutoTuneCache::Instance().CacheMisses();
int64_t step_hits_ = current_hits - previous_hits_; if (last_step_id != current_steps_id_) {
int64_t step_misses_ = current_misses - previous_misses_; int64_t current_hits = AutoTuneCache::Instance().CacheHits();
float step_hit_rate = 0.; int64_t current_misses = AutoTuneCache::Instance().CacheMisses();
int64_t step_num_accesses = step_hits_ + step_misses_; int64_t step_hits_ = current_hits - previous_hits_;
if (step_num_accesses != 0) { int64_t step_misses_ = current_misses - previous_misses_;
step_hit_rate = static_cast<float>(step_hits_) / float step_hit_rate = 0.;
static_cast<float>(step_num_accesses); int64_t step_num_accesses = step_hits_ + step_misses_;
if (step_num_accesses != 0) {
step_hit_rate = static_cast<float>(step_hits_) /
static_cast<float>(step_num_accesses);
}
previous_hits_ = current_hits;
previous_misses_ = current_misses;
current_step_hit_rate_ = step_hit_rate;
last_step_id = current_steps_id_;
} }
previous_hits_ = current_hits; return current_step_hit_rate_;
previous_misses_ = current_misses;
return step_hit_rate;
} }
void SetAutoTuneRange(int64_t start, int64_t stop) { void SetAutoTuneRange(int64_t start, int64_t stop) {
...@@ -108,21 +78,21 @@ class AutoTuneStatus { ...@@ -108,21 +78,21 @@ class AutoTuneStatus {
AutoTuneStatus() = default; AutoTuneStatus() = default;
void Init() { void Init() {
update_use_autotune_ = use_autotune_; use_autotune_ = false;
current_steps_id_ = -1; current_steps_id_ = -1;
previous_hits_ = 0; previous_hits_ = 0;
previous_misses_ = 0; previous_misses_ = 0;
step_hit_rates_.clear(); step_hit_rates_.clear();
AutoTuneCache::Instance().Clean(1.0); AutoTuneCache::Instance().Clean();
} }
int64_t start_step_id_ = 0; bool use_autotune_{false};
int64_t stop_step_id_ = 10; int64_t start_step_id_{1};
int64_t current_steps_id_ = -1; int64_t stop_step_id_{10};
bool use_autotune_ = false; int64_t current_steps_id_{-1};
bool update_use_autotune_ = false; int64_t previous_hits_{0};
int64_t previous_hits_ = 0; int64_t previous_misses_{0};
int64_t previous_misses_ = 0; float current_step_hit_rate_{0.f};
std::vector<float> step_hit_rates_; std::vector<float> step_hit_rates_;
}; };
......
...@@ -289,21 +289,17 @@ void ConvCudnnGradGradKernel( ...@@ -289,21 +289,17 @@ void ConvCudnnGradGradKernel(
dtype}; dtype};
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
miopenConvFwdAlgorithm_t fwd_algo1 = static_cast<miopenConvFwdAlgorithm_t>(0); paddle::operators::SearchResult<miopenConvFwdAlgorithm_t> fwd_result1;
miopenConvFwdAlgorithm_t fwd_algo2 = static_cast<miopenConvFwdAlgorithm_t>(0); paddle::operators::SearchResult<miopenConvFwdAlgorithm_t> fwd_result2;
miopenConvBwdDataAlgorithm_t data_algo = paddle::operators::SearchResult<miopenConvBwdDataAlgorithm_t> data_result;
static_cast<miopenConvBwdDataAlgorithm_t>(0); paddle::operators::SearchResult<miopenConvBwdWeightsAlgorithm_t>
miopenConvBwdWeightsAlgorithm_t filter_algo = filter_result;
static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
#else #else
cudnnConvolutionFwdAlgo_t fwd_algo1 = paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result1;
static_cast<cudnnConvolutionFwdAlgo_t>(0); paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result2;
cudnnConvolutionFwdAlgo_t fwd_algo2 = paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> data_result;
static_cast<cudnnConvolutionFwdAlgo_t>(0); paddle::operators::SearchResult<cudnnConvolutionBwdFilterAlgo_t>
cudnnConvolutionBwdDataAlgo_t data_algo = filter_result;
static_cast<cudnnConvolutionBwdDataAlgo_t>(0);
cudnnConvolutionBwdFilterAlgo_t filter_algo =
static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
#endif #endif
auto layout = paddle::platform::GetCudnnTensorFormat( auto layout = paddle::platform::GetCudnnTensorFormat(
...@@ -332,13 +328,13 @@ void ConvCudnnGradGradKernel( ...@@ -332,13 +328,13 @@ void ConvCudnnGradGradKernel(
using search1 = using search1 =
paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>; paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = search1::GetWorkspaceSize(args1); workspace_size = search1::GetWorkspaceSize(args1);
fwd_algo1 = search1::Find<T>( fwd_result1.algo = search1::Find<T>(
args1, exhaustive_search, false, workspace_size, ctx); args1, exhaustive_search, false, workspace_size, ctx);
#else #else
using search1 = using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_algo1 = search1::Find<T>(args1, exhaustive_search, false, ctx); fwd_result1 = search1::Find<T>(args1, exhaustive_search, false, ctx);
workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1); workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo);
#endif #endif
} }
...@@ -360,14 +356,14 @@ void ConvCudnnGradGradKernel( ...@@ -360,14 +356,14 @@ void ConvCudnnGradGradKernel(
paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>; paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = workspace_size =
std::max(workspace_size, search2::GetWorkspaceSize(args2)); std::max(workspace_size, search2::GetWorkspaceSize(args2));
fwd_algo2 = search2::Find<T>( fwd_result2.algo = search2::Find<T>(
args2, exhaustive_search, false, workspace_size, ctx); args2, exhaustive_search, false, workspace_size, ctx);
#else #else
using search2 = using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_algo2 = search2::Find<T>(args2, exhaustive_search, false, ctx); fwd_result2 = search2::Find<T>(args2, exhaustive_search, false, ctx);
workspace_size = workspace_size = std::max(
std::max(workspace_size, search2::GetWorkspaceSize(args2, fwd_algo2)); workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo));
#endif #endif
} }
} }
...@@ -389,15 +385,15 @@ void ConvCudnnGradGradKernel( ...@@ -389,15 +385,15 @@ void ConvCudnnGradGradKernel(
using search3 = using search3 =
paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>; paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3));
filter_algo = search3::Find<T>( filter_result.algo = search3::Find<T>(
args3, exhaustive_search, deterministic, workspace_size, ctx); args3, exhaustive_search, deterministic, workspace_size, ctx);
#else #else
using search3 = using search3 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo = filter_result =
search3::Find<T>(args3, exhaustive_search, deterministic, ctx); search3::Find<T>(args3, exhaustive_search, deterministic, ctx);
workspace_size = workspace_size = std::max(
std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo));
#endif #endif
} }
...@@ -419,14 +415,15 @@ void ConvCudnnGradGradKernel( ...@@ -419,14 +415,15 @@ void ConvCudnnGradGradKernel(
using search4 = using search4 =
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>; paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4));
data_algo = search4::Find<T>( data_result.algo = search4::Find<T>(
args4, exhaustive_search, deterministic, workspace_size, ctx); args4, exhaustive_search, deterministic, workspace_size, ctx);
#else #else
using search4 = using search4 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
data_algo = search4::Find<T>(args4, exhaustive_search, deterministic, ctx); data_result =
workspace_size = search4::Find<T>(args4, exhaustive_search, deterministic, ctx);
std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); workspace_size = std::max(
workspace_size, search4::GetWorkspaceSize(args4, data_result.algo));
#endif #endif
} }
...@@ -471,7 +468,7 @@ void ConvCudnnGradGradKernel( ...@@ -471,7 +468,7 @@ void ConvCudnnGradGradKernel(
args1.wdesc.desc(), args1.wdesc.desc(),
w, w,
args1.cdesc.desc(), args1.cdesc.desc(),
fwd_algo1, fwd_result1.algo,
&beta, &beta,
args1.odesc.desc(), args1.odesc.desc(),
transformed_ddy_channel, transformed_ddy_channel,
...@@ -492,7 +489,7 @@ void ConvCudnnGradGradKernel( ...@@ -492,7 +489,7 @@ void ConvCudnnGradGradKernel(
args1.wdesc.desc(), args1.wdesc.desc(),
w + i * group_offset_filter, w + i * group_offset_filter,
args1.cdesc.desc(), args1.cdesc.desc(),
fwd_algo1, fwd_result1.algo,
workspace_ptr, workspace_ptr,
workspace_size, workspace_size,
&beta, &beta,
...@@ -517,7 +514,7 @@ void ConvCudnnGradGradKernel( ...@@ -517,7 +514,7 @@ void ConvCudnnGradGradKernel(
args2.wdesc.desc(), args2.wdesc.desc(),
ddw, ddw,
args2.cdesc.desc(), args2.cdesc.desc(),
fwd_algo2, fwd_result2.algo,
&beta, &beta,
args2.odesc.desc(), args2.odesc.desc(),
transformed_ddy_channel, transformed_ddy_channel,
...@@ -538,7 +535,7 @@ void ConvCudnnGradGradKernel( ...@@ -538,7 +535,7 @@ void ConvCudnnGradGradKernel(
args2.wdesc.desc(), args2.wdesc.desc(),
ddw + i * group_offset_filter, ddw + i * group_offset_filter,
args2.cdesc.desc(), args2.cdesc.desc(),
fwd_algo2, fwd_result2.algo,
workspace_ptr, workspace_ptr,
workspace_size, workspace_size,
&alpha, &alpha,
...@@ -568,7 +565,7 @@ void ConvCudnnGradGradKernel( ...@@ -568,7 +565,7 @@ void ConvCudnnGradGradKernel(
args3.idesc.desc(), args3.idesc.desc(),
ddx, ddx,
args3.cdesc.desc(), args3.cdesc.desc(),
filter_algo, filter_result.algo,
&beta, &beta,
args3.wdesc.desc(), args3.wdesc.desc(),
dw, dw,
...@@ -589,7 +586,7 @@ void ConvCudnnGradGradKernel( ...@@ -589,7 +586,7 @@ void ConvCudnnGradGradKernel(
args3.odesc.desc(), args3.odesc.desc(),
transformed_dy_channel + i * group_offset_out, transformed_dy_channel + i * group_offset_out,
args3.cdesc.desc(), args3.cdesc.desc(),
filter_algo, filter_result.algo,
workspace_ptr, workspace_ptr,
workspace_size, workspace_size,
&beta, &beta,
...@@ -615,7 +612,7 @@ void ConvCudnnGradGradKernel( ...@@ -615,7 +612,7 @@ void ConvCudnnGradGradKernel(
args4.wdesc.desc(), args4.wdesc.desc(),
ddw, ddw,
args4.cdesc.desc(), args4.cdesc.desc(),
data_algo, data_result.algo,
&beta, &beta,
args4.idesc.desc(), args4.idesc.desc(),
transformed_dx, transformed_dx,
...@@ -636,7 +633,7 @@ void ConvCudnnGradGradKernel( ...@@ -636,7 +633,7 @@ void ConvCudnnGradGradKernel(
args4.odesc.desc(), args4.odesc.desc(),
transformed_dy_channel + i * group_offset_out, transformed_dy_channel + i * group_offset_out,
args4.cdesc.desc(), args4.cdesc.desc(),
data_algo, data_result.algo,
workspace_ptr, workspace_ptr,
workspace_size, workspace_size,
&beta, &beta,
......
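SearchResult<AlgoT> itself is not shown in this diff; judging from the uses above (result.algo carries the chosen algorithm, result.time a measured cost, and Find<T> returns one by value on the CUDA path), a plausible minimal shape is the following. This is an assumption about the helper header added elsewhere in the PR, not a copy of it:

// Assumed shape of paddle::operators::SearchResult<AlgoT>; the real
// definition may carry additional fields.
template <typename AlgoT>
struct SearchResult {
  SearchResult() {}
  explicit SearchResult(AlgoT a) : algo(a) {}
  AlgoT algo = static_cast<AlgoT>(0);   // selected algorithm
  float time = -1.f;                    // measured time, if any
  size_t workspace_size = 0;            // workspace required by `algo`
};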
...@@ -322,17 +322,16 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -322,17 +322,16 @@ void ConvCudnnGradKernel(const Context& ctx,
int group_offset_in = i_c / groups * i_h * i_w * i_d; int group_offset_in = i_c / groups * i_h * i_w * i_d;
int group_offset_out = o_c / groups * o_h * o_w * o_d; int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = transformed_filter_channel.numel() / groups; int group_offset_filter = transformed_filter_channel.numel() / groups;
// ------------------- cudnn backward algorithm --------------------- // ------------------- cudnn backward algorithm ---------------------
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
miopenConvBwdDataAlgorithm_t data_algo = paddle::operators::SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result;
static_cast<miopenConvBwdDataAlgorithm_t>(0); paddle::operators::SearchResult<miopenConvBwdWeightsAlgorithm_t>
miopenConvBwdWeightsAlgorithm_t filter_algo = filter_result;
static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
#else #else
cudnnConvolutionBwdDataAlgo_t data_algo = paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result;
static_cast<cudnnConvolutionBwdDataAlgo_t>(0); paddle::operators::SearchResult<cudnnConvolutionBwdFilterAlgo_t>
cudnnConvolutionBwdFilterAlgo_t filter_algo = filter_result;
static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
#endif #endif
// input data workspace_size // input data workspace_size
size_t workspace_size_d = 0; size_t workspace_size_d = 0;
...@@ -368,14 +367,14 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -368,14 +367,14 @@ void ConvCudnnGradKernel(const Context& ctx,
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>; paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size_d = workspace_size_d =
std::max(workspace_size_d, search1::GetWorkspaceSize(args1)); std::max(workspace_size_d, search1::GetWorkspaceSize(args1));
data_algo = search1::Find<T>( bwd_result.algo = search1::Find<T>(
args1, exhaustive_search, deterministic, workspace_size_d, ctx); args1, exhaustive_search, deterministic, workspace_size_d, ctx);
#else #else
using search1 = using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
data_algo = search1::Find<T>(args1, exhaustive_search, deterministic, ctx); bwd_result = search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
workspace_size_d = workspace_size_d = std::max(
std::max(workspace_size_d, search1::GetWorkspaceSize(args1, data_algo)); workspace_size_d, search1::GetWorkspaceSize(args1, bwd_result.algo));
#endif #endif
} }
...@@ -397,15 +396,17 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -397,15 +396,17 @@ void ConvCudnnGradKernel(const Context& ctx,
paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>; paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size_w = workspace_size_w =
std::max(workspace_size_w, search2::GetWorkspaceSize(args2)); std::max(workspace_size_w, search2::GetWorkspaceSize(args2));
filter_algo = search2::Find<T>( filter_result.algo = search2::Find<T>(
args2, exhaustive_search, deterministic, workspace_size_w, ctx); args2, exhaustive_search, deterministic, workspace_size_w, ctx);
#else #else
using search2 = using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo = filter_result =
search2::Find<T>(args2, exhaustive_search, deterministic, ctx); search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
workspace_size_w = std::max(workspace_size_w, VLOG(3) << "filter algo: " << filter_result.algo << ", time "
search2::GetWorkspaceSize(args2, filter_algo)); << filter_result.time;
workspace_size_w = std::max(
workspace_size_w, search2::GetWorkspaceSize(args2, filter_result.algo));
#endif #endif
} }
...@@ -439,7 +440,7 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -439,7 +440,7 @@ void ConvCudnnGradKernel(const Context& ctx,
args1.wdesc.desc(), args1.wdesc.desc(),
filter_data, filter_data,
args1.cdesc.desc(), args1.cdesc.desc(),
data_algo, bwd_result.algo,
&beta, &beta,
args1.idesc.desc(), args1.idesc.desc(),
temp_tensor_data, temp_tensor_data,
...@@ -471,7 +472,7 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -471,7 +472,7 @@ void ConvCudnnGradKernel(const Context& ctx,
args1.wdesc.desc(), args1.wdesc.desc(),
filter_data, filter_data,
args1.cdesc.desc(), args1.cdesc.desc(),
data_algo, bwd_result.algo,
&beta, &beta,
args1.idesc.desc(), args1.idesc.desc(),
transformed_input_grad_data, transformed_input_grad_data,
...@@ -494,7 +495,7 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -494,7 +495,7 @@ void ConvCudnnGradKernel(const Context& ctx,
args1.odesc.desc(), args1.odesc.desc(),
output_grad_data + i * group_offset_out, output_grad_data + i * group_offset_out,
args1.cdesc.desc(), args1.cdesc.desc(),
data_algo, bwd_result.algo,
cudnn_workspace_ptr, cudnn_workspace_ptr,
workspace_size_d, workspace_size_d,
&beta, &beta,
...@@ -554,7 +555,7 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -554,7 +555,7 @@ void ConvCudnnGradKernel(const Context& ctx,
args2.idesc.desc(), args2.idesc.desc(),
input_data, input_data,
args2.cdesc.desc(), args2.cdesc.desc(),
filter_algo, filter_result.algo,
&beta, &beta,
args2.wdesc.desc(), args2.wdesc.desc(),
filter_grad_data, filter_grad_data,
...@@ -575,7 +576,7 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -575,7 +576,7 @@ void ConvCudnnGradKernel(const Context& ctx,
args2.odesc.desc(), args2.odesc.desc(),
output_grad_data + i * group_offset_out, output_grad_data + i * group_offset_out,
args2.cdesc.desc(), args2.cdesc.desc(),
filter_algo, filter_result.algo,
cudnn_workspace_ptr, cudnn_workspace_ptr,
workspace_size_w, workspace_size_w,
&beta_filter, &beta_filter,
......
...@@ -18,7 +18,6 @@ ...@@ -18,7 +18,6 @@
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/framework/eigen.h"
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h" #include "paddle/fluid/operators/conv_miopen_helper.h"
#else #else
...@@ -68,7 +67,6 @@ void ConvCudnnKernel(const Context& ctx, ...@@ -68,7 +67,6 @@ void ConvCudnnKernel(const Context& ctx,
"FLAGS_cudnn_deterministic True at same time.")); "FLAGS_cudnn_deterministic True at same time."));
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
auto dtype = paddle::platform::CudnnDataType<T>::type; auto dtype = paddle::platform::CudnnDataType<T>::type;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -309,17 +307,17 @@ void ConvCudnnKernel(const Context& ctx, ...@@ -309,17 +307,17 @@ void ConvCudnnKernel(const Context& ctx,
size_t workspace_size = 0; // final workspace to allocate. size_t workspace_size = 0; // final workspace to allocate.
// ------------------- cudnn conv algorithm --------------------- // ------------------- cudnn conv algorithm ---------------------
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
miopenConvFwdAlgorithm_t algo{}; paddle::operators::SearchResult<miopenConvFwdAlgorithm_t> fwd_result;
using search = paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>; using search = paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = search::GetWorkspaceSize(args); workspace_size = search::GetWorkspaceSize(args);
algo = search::Find<T>( fwd_result.algo = search::Find<T>(
args, exhaustive_search, deterministic, workspace_size, ctx); args, exhaustive_search, deterministic, workspace_size, ctx);
#else #else
cudnnConvolutionFwdAlgo_t algo{}; paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
using search = using search =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
algo = search::Find<T>(args, exhaustive_search, deterministic, ctx); fwd_result = search::Find<T>(args, exhaustive_search, deterministic, ctx);
workspace_size = search::GetWorkspaceSize(args, algo); workspace_size = search::GetWorkspaceSize(args, fwd_result.algo);
#endif #endif
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
...@@ -328,7 +326,7 @@ void ConvCudnnKernel(const Context& ctx, ...@@ -328,7 +326,7 @@ void ConvCudnnKernel(const Context& ctx,
// in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\ // in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\
// FWD_ALGO_IMPLICIT_GEMM manually. // FWD_ALGO_IMPLICIT_GEMM manually.
if (groups > 1) { if (groups > 1) {
algo = static_cast<cudnnConvolutionFwdAlgo_t>(0); fwd_result.algo = static_cast<cudnnConvolutionFwdAlgo_t>(0);
} }
#endif #endif
...@@ -352,7 +350,7 @@ void ConvCudnnKernel(const Context& ctx, ...@@ -352,7 +350,7 @@ void ConvCudnnKernel(const Context& ctx,
args.wdesc.desc(), args.wdesc.desc(),
filter_data, filter_data,
args.cdesc.desc(), args.cdesc.desc(),
algo, fwd_result.algo,
&beta, &beta,
args.odesc.desc(), args.odesc.desc(),
output_data, output_data,
...@@ -373,7 +371,7 @@ void ConvCudnnKernel(const Context& ctx, ...@@ -373,7 +371,7 @@ void ConvCudnnKernel(const Context& ctx,
args.wdesc.desc(), args.wdesc.desc(),
filter_data + i * group_offset_filter, filter_data + i * group_offset_filter,
args.cdesc.desc(), args.cdesc.desc(),
algo, fwd_result.algo,
workspace_ptr, workspace_ptr,
workspace_size, workspace_size,
&beta, &beta,
......
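The forward kernel now follows the same pattern on the CUDA path: Find<T> returns the SearchResult first, then the workspace is sized from the chosen algorithm (on the MIOpen path the workspace size is still queried from the args before Find). Condensed, with placeholder names:

// CUDA path, condensed from the hunk above.
using search = paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
auto fwd_result = search::Find<T>(args, exhaustive_search, deterministic, ctx);
size_t workspace_size = search::GetWorkspaceSize(args, fwd_result.algo);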
...@@ -188,11 +188,13 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -188,11 +188,13 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
dtype}; dtype};
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
miopenConvFwdAlgorithm_t data_algo{}; paddle::operators::SearchResult<miopenConvFwdAlgorithm_t> fwd_result;
miopenConvBwdWeightsAlgorithm_t filter_algo{}; paddle::operators::SearchResult<miopenConvBwdWeightsAlgorithm_t>
filter_result;
#else #else
cudnnConvolutionFwdAlgo_t data_algo{}; paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
cudnnConvolutionBwdFilterAlgo_t filter_algo{}; paddle::operators::SearchResult<cudnnConvolutionBwdFilterAlgo_t>
filter_result;
#endif #endif
auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout); auto layout_tensor = paddle::platform::GetCudnnTensorFormat(layout);
...@@ -218,14 +220,14 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -218,14 +220,14 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
using search1 = using search1 =
paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>; paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1));
data_algo = fwd_result.algo =
search1::Find<T>(args1, false, deterministic, workspace_size, ctx); search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
#else #else
using search1 = using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
data_algo = search1::Find<T>(args1, false, deterministic, ctx); fwd_result = search1::Find<T>(args1, false, deterministic, ctx);
workspace_size = workspace_size = std::max(
std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo));
#endif #endif
} }
...@@ -245,14 +247,14 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -245,14 +247,14 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
using search2 = using search2 =
paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>; paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2));
filter_algo = filter_result.algo =
search2::Find<T>(args2, false, deterministic, workspace_size, ctx); search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
#else #else
using search2 = using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo = search2::Find<T>(args2, false, deterministic, ctx); filter_result = search2::Find<T>(args2, false, deterministic, ctx);
workspace_size = workspace_size = std::max(
std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo)); workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo));
#endif #endif
} }
...@@ -278,7 +280,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -278,7 +280,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
args1.wdesc.desc(), args1.wdesc.desc(),
filter_data + filter_offset * g, filter_data + filter_offset * g,
args1.cdesc.desc(), args1.cdesc.desc(),
data_algo, fwd_result.algo,
&beta, &beta,
args1.odesc.desc(), args1.odesc.desc(),
dx_data + x_offset * g, dx_data + x_offset * g,
...@@ -295,7 +297,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -295,7 +297,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
args1.wdesc.desc(), args1.wdesc.desc(),
filter_data + filter_offset * g, filter_data + filter_offset * g,
args1.cdesc.desc(), args1.cdesc.desc(),
data_algo, fwd_result.algo,
cudnn_workspace, cudnn_workspace,
workspace_size, workspace_size,
&beta, &beta,
...@@ -338,7 +340,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -338,7 +340,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
args2.idesc.desc(), args2.idesc.desc(),
dout_data + dout_offset * g, dout_data + dout_offset * g,
args2.cdesc.desc(), args2.cdesc.desc(),
filter_algo, filter_result.algo,
&beta, &beta,
args2.wdesc.desc(), args2.wdesc.desc(),
dfilter_data + filter_offset * g, dfilter_data + filter_offset * g,
...@@ -355,7 +357,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -355,7 +357,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
args2.odesc.desc(), args2.odesc.desc(),
x_data + x_offset * g, x_data + x_offset * g,
args2.cdesc.desc(), args2.cdesc.desc(),
filter_algo, filter_result.algo,
cudnn_workspace, cudnn_workspace,
workspace_size, workspace_size,
&beta, &beta,
...@@ -653,22 +655,17 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -653,22 +655,17 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
dilations_, dilations_,
dtype}; dtype};
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
miopenConvBwdDataAlgorithm_t bwd_algo1 = paddle::operators::SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result1;
static_cast<miopenConvBwdDataAlgorithm_t>(0); paddle::operators::SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result2;
miopenConvBwdDataAlgorithm_t bwd_algo2 = paddle::operators::SearchResult<miopenConvBwdWeightsAlgorithm_t>
static_cast<miopenConvBwdDataAlgorithm_t>(0); filter_result;
miopenConvFwdAlgorithm_t data_algo = static_cast<miopenConvFwdAlgorithm_t>(0); paddle::operators::SearchResult<miopenConvFwdAlgorithm_t> fwd_result;
miopenConvBwdWeightsAlgorithm_t filter_algo =
static_cast<miopenConvBwdWeightsAlgorithm_t>(0);
#else #else
cudnnConvolutionBwdDataAlgo_t bwd_algo1 = paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result1;
static_cast<cudnnConvolutionBwdDataAlgo_t>(0); paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result2;
cudnnConvolutionBwdDataAlgo_t bwd_algo2 = paddle::operators::SearchResult<cudnnConvolutionBwdFilterAlgo_t>
static_cast<cudnnConvolutionBwdDataAlgo_t>(0); filter_result;
cudnnConvolutionFwdAlgo_t data_algo = paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
static_cast<cudnnConvolutionFwdAlgo_t>(0);
cudnnConvolutionBwdFilterAlgo_t filter_algo =
static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
#endif #endif
auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW); auto layout = paddle::platform::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW);
...@@ -696,13 +693,13 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -696,13 +693,13 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
using search1 = using search1 =
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>; paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = search1::GetWorkspaceSize(args1); workspace_size = search1::GetWorkspaceSize(args1);
bwd_algo1 = bwd_result1.algo =
search1::Find<T>(args1, false, deterministic, workspace_size, ctx); search1::Find<T>(args1, false, deterministic, workspace_size, ctx);
#else #else
using search1 = using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_algo1 = search1::Find<T>(args1, false, deterministic, ctx); bwd_result1 = search1::Find<T>(args1, false, deterministic, ctx);
workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo);
#endif #endif
ddfilter_ = ddfilter.data<T>(); ddfilter_ = ddfilter.data<T>();
...@@ -720,14 +717,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -720,14 +717,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
using search2 = using search2 =
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>; paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2));
bwd_algo2 = bwd_result2.algo =
search2::Find<T>(args2, false, deterministic, workspace_size, ctx); search2::Find<T>(args2, false, deterministic, workspace_size, ctx);
#else #else
using search2 = using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_algo2 = search2::Find<T>(args2, false, deterministic, ctx); bwd_result2 = search2::Find<T>(args2, false, deterministic, ctx);
workspace_size = workspace_size = std::max(
std::max(workspace_size, search2::GetWorkspaceSize(args2, bwd_algo2)); workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo));
#endif #endif
} }
...@@ -736,9 +733,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -736,9 +733,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args3.handle = handle; args3.handle = handle;
args3.idesc.set(transformed_dout, iwo_group); args3.idesc.set(transformed_dout, iwo_group);
args3.wdesc.set(*dfilter, layout, iwo_group); args3.wdesc.set(*dfilter, layout, iwo_group);
args3.odesc.set(transformed_ddx_channel, iwo_group); args3.odesc.set(transformed_ddx_channel, iwo_group);
args3.cdesc.set(dtype, args3.cdesc.set(dtype,
padding_common, padding_common,
strides, strides,
...@@ -749,14 +744,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -749,14 +744,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
using search3 = using search3 =
paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>; paddle::operators::SearchAlgorithm<miopenConvBwdWeightsAlgorithm_t>;
workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3));
filter_algo = filter_result.algo =
search3::Find<T>(args3, false, deterministic, workspace_size, ctx); search3::Find<T>(args3, false, deterministic, workspace_size, ctx);
#else #else
using search3 = using search3 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_algo = search3::Find<T>(args3, false, deterministic, ctx); filter_result = search3::Find<T>(args3, false, deterministic, ctx);
workspace_size = workspace_size = std::max(
std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo));
#endif #endif
} }
...@@ -777,14 +772,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -777,14 +772,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
using search4 = using search4 =
paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>; paddle::operators::SearchAlgorithm<miopenConvFwdAlgorithm_t>;
workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4));
data_algo = fwd_result.algo =
search4::Find<T>(args4, false, deterministic, workspace_size, ctx); search4::Find<T>(args4, false, deterministic, workspace_size, ctx);
#else #else
using search4 = using search4 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
data_algo = search4::Find<T>(args4, false, deterministic, ctx); fwd_result = search4::Find<T>(args4, false, deterministic, ctx);
workspace_size = workspace_size = std::max(
std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo));
#endif #endif
} }
...@@ -831,7 +826,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -831,7 +826,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args1.wdesc.desc(), args1.wdesc.desc(),
filter_ + i * group_offset_filter, filter_ + i * group_offset_filter,
args1.cdesc.desc(), args1.cdesc.desc(),
bwd_algo1, bwd_result1.algo,
&beta, &beta,
args1.idesc.desc(), args1.idesc.desc(),
transformed_ddout_channel_ + i * group_offset_out, transformed_ddout_channel_ + i * group_offset_out,
...@@ -850,7 +845,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -850,7 +845,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args1.odesc.desc(), args1.odesc.desc(),
ddx_ + i * group_offset_in, ddx_ + i * group_offset_in,
args1.cdesc.desc(), args1.cdesc.desc(),
bwd_algo1, bwd_result1.algo,
workspace_ptr, workspace_ptr,
workspace_size, workspace_size,
&beta, &beta,
...@@ -877,7 +872,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -877,7 +872,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args2.wdesc.desc(), args2.wdesc.desc(),
ddfilter_ + i * group_offset_filter, ddfilter_ + i * group_offset_filter,
args2.cdesc.desc(), args2.cdesc.desc(),
bwd_algo2, bwd_result2.algo,
&beta, &beta,
args2.idesc.desc(), args2.idesc.desc(),
conv_x_ddfilter_data + i * group_offset_out, conv_x_ddfilter_data + i * group_offset_out,
...@@ -908,7 +903,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -908,7 +903,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args2.odesc.desc(), args2.odesc.desc(),
x_ + i * group_offset_in, x_ + i * group_offset_in,
args2.cdesc.desc(), args2.cdesc.desc(),
bwd_algo2, bwd_result2.algo,
workspace_ptr, workspace_ptr,
workspace_size, workspace_size,
&alpha, &alpha,
...@@ -964,7 +959,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -964,7 +959,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args3.idesc.desc(), args3.idesc.desc(),
transformed_dout_channel_ + i * group_offset_out, transformed_dout_channel_ + i * group_offset_out,
args3.cdesc.desc(), args3.cdesc.desc(),
filter_algo, filter_result.algo,
&beta, &beta,
args3.wdesc.desc(), args3.wdesc.desc(),
dfilter_ + i * group_offset_filter, dfilter_ + i * group_offset_filter,
...@@ -983,7 +978,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -983,7 +978,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args3.odesc.desc(), args3.odesc.desc(),
ddx_ + i * group_offset_in, ddx_ + i * group_offset_in,
args3.cdesc.desc(), args3.cdesc.desc(),
filter_algo, filter_result.algo,
workspace_ptr, workspace_ptr,
workspace_size, workspace_size,
&beta, &beta,
...@@ -1009,7 +1004,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -1009,7 +1004,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args4.wdesc.desc(), args4.wdesc.desc(),
ddfilter_ + i * group_offset_filter, ddfilter_ + i * group_offset_filter,
args4.cdesc.desc(), args4.cdesc.desc(),
data_algo, fwd_result.algo,
&beta, &beta,
args4.odesc.desc(), args4.odesc.desc(),
transformed_dx_ + i * group_offset_in, transformed_dx_ + i * group_offset_in,
...@@ -1028,7 +1023,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -1028,7 +1023,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args4.wdesc.desc(), args4.wdesc.desc(),
ddfilter_ + i * group_offset_filter, ddfilter_ + i * group_offset_filter,
args4.cdesc.desc(), args4.cdesc.desc(),
data_algo, fwd_result.algo,
workspace_ptr, workspace_ptr,
workspace_size, workspace_size,
&beta, &beta,
......
...@@ -217,16 +217,19 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, ...@@ -217,16 +217,19 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
c_groups); c_groups);
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
paddle::operators::SearchResult<miopenConvBwdDataAlgorithm_t> bwd_result;
using search = using search =
paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>; paddle::operators::SearchAlgorithm<miopenConvBwdDataAlgorithm_t>;
workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args));
algo = search::Find<T>(args, false, deterministic, workspace_size, ctx); bwd_result.algo =
search::Find<T>(args, false, deterministic, workspace_size, ctx);
#else #else
paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result;
using search = using search =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
algo = search::Find<T>(args, false, deterministic, ctx); bwd_result = search::Find<T>(args, false, deterministic, ctx);
workspace_size = workspace_size =
std::max(workspace_size, search::GetWorkspaceSize(args, algo)); std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo));
#endif #endif
// ------------------- cudnn conv transpose forward --------------------- // ------------------- cudnn conv transpose forward ---------------------
...@@ -247,7 +250,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, ...@@ -247,7 +250,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
args.wdesc.desc(), args.wdesc.desc(),
filter_data + filter_offset * g, filter_data + filter_offset * g,
args.cdesc.desc(), args.cdesc.desc(),
algo, bwd_result.algo,
&beta, &beta,
args.idesc.desc(), args.idesc.desc(),
transformed_out_data + out_offset * g, transformed_out_data + out_offset * g,
...@@ -264,7 +267,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, ...@@ -264,7 +267,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
args.odesc.desc(), args.odesc.desc(),
x_data + x_offset * g, x_data + x_offset * g,
args.cdesc.desc(), args.cdesc.desc(),
algo, bwd_result.algo,
cudnn_workspace, cudnn_workspace,
workspace_size, workspace_size,
&beta, &beta,
......
...@@ -36,7 +36,7 @@ ...@@ -36,7 +36,7 @@
#include "paddle/phi/kernels/funcs/batch_norm_utils.h" #include "paddle/phi/kernels/funcs/batch_norm_utils.h"
DECLARE_bool(cudnn_deterministic); DECLARE_bool(cudnn_deterministic);
DECLARE_uint64(conv_workspace_size_limit); DECLARE_int64(conv_workspace_size_limit);
DECLARE_bool(cudnn_exhaustive_search); DECLARE_bool(cudnn_exhaustive_search);
namespace phi { namespace phi {
......
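The declaration change above turns FLAGS_conv_workspace_size_limit from an unsigned into a signed 64-bit flag, so a negative value can now be passed from Python. The test file that follows relies on this; the snippet below is only a hedged sketch of that flag usage, with the concrete values (-1 when autotune is enabled, 512 MB otherwise) taken from the test itself rather than from any documented flag semantics.

import paddle

# Hedged sketch: mirrors the set_flags()/get_flags() helpers in the test below.
if paddle.is_compiled_with_cuda():
    paddle.set_flags({'FLAGS_conv_workspace_size_limit': -1})     # autotune enabled
    # paddle.set_flags({'FLAGS_conv_workspace_size_limit': 512})  # autotune disabled
    print(paddle.get_flags('FLAGS_conv_workspace_size_limit'))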
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
import paddle import paddle
import unittest import unittest
import numpy import numpy as np
class SimpleNet(paddle.nn.Layer): class SimpleNet(paddle.nn.Layer):
...@@ -27,6 +27,7 @@ class SimpleNet(paddle.nn.Layer): ...@@ -27,6 +27,7 @@ class SimpleNet(paddle.nn.Layer):
def train_dygraph(net, data): def train_dygraph(net, data):
data.stop_gradient = False
out = net(data) out = net(data)
loss = paddle.mean(out) loss = paddle.mean(out)
adam = paddle.optimizer.Adam(parameters=net.parameters()) adam = paddle.optimizer.Adam(parameters=net.parameters())
...@@ -36,6 +37,7 @@ def train_dygraph(net, data): ...@@ -36,6 +37,7 @@ def train_dygraph(net, data):
def static_program(net, data): def static_program(net, data):
data.stop_gradient = False
out = net(data) out = net(data)
loss = paddle.mean(out) loss = paddle.mean(out)
adam = paddle.optimizer.Adam() adam = paddle.optimizer.Adam()
...@@ -44,48 +46,63 @@ def static_program(net, data): ...@@ -44,48 +46,63 @@ def static_program(net, data):
class TestAutoTune(unittest.TestCase): class TestAutoTune(unittest.TestCase):
def set_flags(self, enable_autotune):
if paddle.is_compiled_with_cuda():
if enable_autotune:
paddle.set_flags({'FLAGS_conv_workspace_size_limit': -1})
else:
paddle.set_flags({'FLAGS_conv_workspace_size_limit': 512})
def get_flags(self, name):
res = paddle.get_flags(name)
return res[name]
def get_expected_res(self, step_id, enable_autotune):
expected_res = {
"step_id": step_id,
"cache_size": 0,
"cache_hit_rate": 0
}
if paddle.is_compiled_with_cuda():
# Total 3 * num_iters cache accesses, only iter 2 hits the cache.
if enable_autotune and step_id >= 1:
expected_res["cache_size"] = 3
if enable_autotune and step_id == 2:
expected_res["cache_hit_rate"] = np.round(
float(3) / float(9), 5)
return expected_res
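The expected numbers above follow from a simple count: per the comment, each training step performs 3 cache accesses (presumably the forward, backward-data and backward-filter algorithm lookups of the single conv layer), and only the step after the tuning window hits entries cached during tuning. A minimal sketch of that arithmetic, under those assumptions:

import numpy as np

accesses_per_step = 3                         # assumed: fwd + bwd-data + bwd-filter lookups
steps = 3                                     # the test runs three iterations (step_id 0, 1, 2)
total_accesses = accesses_per_step * steps    # 9
hits = accesses_per_step                      # only step 2 reuses the 3 entries cached at step 1
print(np.round(hits / float(total_accesses), 5))   # 0.33333, the expected cache_hit_rate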
def test_autotune(self): def test_autotune(self):
paddle.fluid.core.disable_autotune() paddle.fluid.core.disable_autotune()
status = paddle.fluid.core.autotune_status() self.assertEqual(self.get_flags("FLAGS_use_autotune"), False)
self.assertEqual(status["use_autotune"], False)
paddle.fluid.core.enable_autotune() paddle.fluid.core.enable_autotune()
status = paddle.fluid.core.autotune_status() self.assertEqual(self.get_flags("FLAGS_use_autotune"), True)
self.assertEqual(status["use_autotune"], True)
def check_status(self, expected_res): def check_status(self, expected_res):
status = paddle.fluid.core.autotune_status() status = paddle.fluid.core.autotune_status()
for key in status.keys(): for key in status.keys():
self.assertEqual(status[key], expected_res[key]) if key == "cache_hit_rate":
v = np.round(status[key], 5)
else:
v = status[key]
self.assertEqual(v, expected_res[key])
class TestDygraphAutoTuneStatus(TestAutoTune): class TestDygraphAutoTuneStatus(TestAutoTune):
def run_program(self, enable_autotune): def run_program(self, enable_autotune):
self.set_flags(enable_autotune)
if enable_autotune: if enable_autotune:
paddle.fluid.core.enable_autotune() paddle.fluid.core.enable_autotune()
else: else:
paddle.fluid.core.disable_autotune() paddle.fluid.core.disable_autotune()
paddle.fluid.core.autotune_range(1, 2) paddle.fluid.core.set_autotune_range(1, 2)
x_var = paddle.uniform((1, 1, 8, 8), dtype='float32', min=-1., max=1.) x_var = paddle.uniform((1, 1, 8, 8), dtype='float32', min=-1., max=1.)
net = SimpleNet() net = SimpleNet()
for i in range(3): for i in range(3):
train_dygraph(net, x_var) train_dygraph(net, x_var)
if i >= 1 and i < 2: expected_res = self.get_expected_res(i, enable_autotune)
expected_res = { self.check_status(expected_res)
"step_id": i,
"use_autotune": enable_autotune,
"cache_size": 0,
"cache_hit_rate": 0
}
self.check_status(expected_res)
else:
expected_res = {
"step_id": i,
"use_autotune": False,
"cache_size": 0,
"cache_hit_rate": 0
}
self.check_status(expected_res)
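Outside the unittest scaffolding, the dygraph flow above reduces to a few calls. This is a hedged usage sketch assembled from the test itself (SimpleNet and train_dygraph are the helpers defined earlier in this file); set_autotune_range(1, 2) appears to restrict tuning to step 1, after which cached algorithms are reused.

import paddle

paddle.set_flags({'FLAGS_conv_workspace_size_limit': -1})   # as in set_flags(True) above
paddle.fluid.core.enable_autotune()
paddle.fluid.core.set_autotune_range(1, 2)

net = SimpleNet()
x = paddle.uniform((1, 1, 8, 8), dtype='float32', min=-1., max=1.)
for i in range(3):
    train_dygraph(net, x)
    # dict with step_id, cache_size and cache_hit_rate, as verified by check_status()
    print(paddle.fluid.core.autotune_status())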
def func_enable_autotune(self): def func_enable_autotune(self):
self.run_program(enable_autotune=True) self.run_program(enable_autotune=True)
...@@ -107,59 +124,45 @@ class TestDygraphAutoTuneStatus(TestAutoTune): ...@@ -107,59 +124,45 @@ class TestDygraphAutoTuneStatus(TestAutoTune):
class TestStaticAutoTuneStatus(TestAutoTune): class TestStaticAutoTuneStatus(TestAutoTune):
def run_program(self, enable_autotune): def run_program(self, enable_autotune):
paddle.enable_static() paddle.enable_static()
if enable_autotune:
paddle.fluid.core.enable_autotune()
else:
paddle.fluid.core.disable_autotune()
paddle.fluid.core.autotune_range(1, 2)
data_shape = [1, 1, 8, 8] data_shape = [1, 1, 8, 8]
data = paddle.static.data(name='X', shape=data_shape, dtype='float32') main_program = paddle.static.Program()
net = SimpleNet() startup_program = paddle.static.Program()
loss = static_program(net, data) with paddle.static.program_guard(main_program, startup_program):
data = paddle.static.data(
name='X', shape=data_shape, dtype='float32')
net = SimpleNet()
loss = static_program(net, data)
place = paddle.CUDAPlace(0) if paddle.fluid.core.is_compiled_with_cuda( place = paddle.CUDAPlace(0) if paddle.fluid.core.is_compiled_with_cuda(
) else paddle.CPUPlace() ) else paddle.CPUPlace()
exe = paddle.static.Executor(place) exe = paddle.static.Executor(place)
exe.run(paddle.static.default_startup_program()) exe.run(startup_program)
x = numpy.random.random(size=data_shape).astype('float32') x = np.random.random(size=data_shape).astype('float32')
self.set_flags(enable_autotune)
if enable_autotune:
paddle.fluid.core.enable_autotune()
else:
paddle.fluid.core.disable_autotune()
paddle.fluid.core.set_autotune_range(1, 2)
for i in range(3): for i in range(3):
exe.run(feed={'X': x}, fetch_list=[loss]) exe.run(program=main_program, feed={'X': x}, fetch_list=[loss])
status = paddle.fluid.core.autotune_status() status = paddle.fluid.core.autotune_status()
# In static mode, the startup_program will run at first. expected_res = self.get_expected_res(i, enable_autotune)
# The expected step_id will be increased by 1. self.check_status(expected_res)
if i >= 0 and i < 1:
expected_res = {
"step_id": i + 1,
"use_autotune": enable_autotune,
"cache_size": 0,
"cache_hit_rate": 0
}
self.check_status(expected_res)
else:
expected_res = {
"step_id": i + 1,
"use_autotune": False,
"cache_size": 0,
"cache_hit_rate": 0
}
self.check_status(expected_res)
paddle.disable_static() paddle.disable_static()
def func_enable_autotune(self): def func_enable_autotune(self):
self.run_program(enable_autotune=True) self.run_program(enable_autotune=True)
def test_enable_autotune(self): def test_enable_autotune(self):
with paddle.fluid.framework._test_eager_guard():
self.func_enable_autotune()
self.func_enable_autotune() self.func_enable_autotune()
def func_disable_autotune(self): def func_disable_autotune(self):
self.run_program(enable_autotune=False) self.run_program(enable_autotune=False)
def test_disable_autotune(self): def test_disable_autotune(self):
with paddle.fluid.framework._test_eager_guard():
self.func_disable_autotune()
self.func_disable_autotune() self.func_disable_autotune()
......