Unverified commit c0ed8729, authored by Yiqun Liu, committed by GitHub

[Cherry-pick] Simplify conv codes and fix cache and autotune bugs. (#47197)

* Simplify the conv code. (#45966)

* Record whether the conv algo was obtained by exhaustive search, to fix an autotune cache bug. (#47065)
Parent commit 50d4fa54
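The core of the autotune cache fix is that each cached conv algorithm now records whether it was obtained by exhaustive search, so a heuristic result can later be replaced by an autotuned one but never the other way around. Below is a minimal, self-contained sketch of that update rule; CachedAlgo and Update are hypothetical stand-ins, not Paddle's actual types (the real code uses ConvCacheKey and ConvAutoTuneResult, shown further down in this diff).

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <unordered_map>

// Simplified stand-in for phi::autotune::ConvAutoTuneResult.
struct CachedAlgo {
  int64_t algo = 0;
  size_t workspace_size = 0;
  bool exhaustive_search = false;  // true if found by exhaustive search / autotune
};

// Hypothetical cache keyed by a pre-hashed conv configuration.
using ConvCache = std::unordered_map<size_t, CachedAlgo>;

// Update rule: an exhaustively-searched result may overwrite a heuristic one,
// but a heuristic result never overwrites an exhaustive one.
void Update(ConvCache* cache, size_t key, const CachedAlgo& candidate) {
  auto it = cache->find(key);
  if (it == cache->end() ||
      (!it->second.exhaustive_search && candidate.exhaustive_search)) {
    (*cache)[key] = candidate;
  }
}

int main() {
  ConvCache cache;
  Update(&cache, 42, {/*algo=*/1, /*workspace=*/0, /*exhaustive=*/false});  // heuristic
  Update(&cache, 42, {/*algo=*/3, /*workspace=*/0, /*exhaustive=*/true});   // autotune wins
  Update(&cache, 42, {/*algo=*/1, /*workspace=*/0, /*exhaustive=*/false});  // ignored
  std::cout << "algo=" << cache[42].algo
            << ", exhaustive=" << cache[42].exhaustive_search << "\n";  // algo=3, exhaustive=1
  return 0;
}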
......@@ -36,17 +36,10 @@ using framework::ConvSearchCache;
template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
// The base template for the SearchAlgorithm struct.
template <typename PerfT>
struct SearchAlgorithm {};
// The container for the result of SearchAlgorithm::Find().
template <typename AlgoT>
struct SearchResult {
SearchResult() {}
explicit SearchResult(const phi::autotune::DnnNode& node)
: algo(static_cast<AlgoT>(node.algo)),
workspace_size(node.workspace_size) {}
explicit SearchResult(AlgoT a) : algo(a) {}
explicit SearchResult(AlgoT a, float t, size_t size)
......@@ -55,12 +48,21 @@ struct SearchResult {
AlgoT algo = static_cast<AlgoT>(0);
float time = -1.f;
size_t workspace_size = 0;
bool exhaustive_search = false;
};
template <typename T>
static std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
out << "[";
for (auto const& tmp : v) out << tmp << ",";
bool is_first = true;
for (auto const& tmp : v) {
if (is_first) {
out << tmp;
is_first = false;
} else {
out << ", " << tmp;
}
}
out << "]";
return out;
}
......@@ -113,7 +115,7 @@ struct ConvArgsBase {
auto w_shape = phi::vectorize(w->dims());
VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape
<< ", strides=" << s << ", paddings=" << p << ", dilations=" << d
<< ",data= " << paddle::experimental::CppTypeToDataType<T>::Type()
<< ", data=" << paddle::experimental::CppTypeToDataType<T>::Type()
<< ", group=" << group
<< ", data layout=" << static_cast<int64_t>(data_layout);
......
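The operator<< rework above drops the trailing comma and joins elements with ", ". A tiny self-contained sketch of the same comma-joining pattern; Print is an illustrative helper, not Paddle code.

#include <iostream>
#include <vector>

// Same separator handling as the rewritten operator<< above: no separator
// before the first element, ", " before every later element.
template <typename T>
std::ostream& Print(std::ostream& out, const std::vector<T>& v) {
  out << "[";
  bool is_first = true;
  for (const auto& item : v) {
    out << (is_first ? "" : ", ") << item;
    is_first = false;
  }
  return out << "]";
}

int main() {
  std::vector<int> dims = {4, 224, 224, 3};
  Print(std::cout, dims) << "\n";  // prints [4, 224, 224, 3] instead of [4,224,224,3,]
  return 0;
}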
......@@ -14,12 +14,11 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/conv_base_helper.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
namespace paddle {
......@@ -53,11 +52,9 @@ static void RemovePaddingSlice(const phi::GPUContext& context,
}
auto in_t =
framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*input);
auto out_t =
framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*out, new_out_dims);
phi::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(*input);
auto out_t = phi::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*out, new_out_dims);
phi::funcs::EigenSlice<std::decay_t<decltype(place)>, T, D>::Eval(
place, out_t, in_t, offsets, extents);
......@@ -147,83 +144,21 @@ void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
}
}
static void SetConvMathType(const phi::GPUContext& ctx,
cudnnDataType_t dtype,
const platform::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_TENSOR_OP_MATH));
VLOG(5) << "use cudnn_tensor_op_math";
#if CUDA_VERSION >= 11000
#if CUDNN_VERSION_MIN(8, 1, 0)
} else if (ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_TENSOR_OP_MATH));
#endif // CUDNN_VERSION_MIN(8, 1, 0)
} else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_FMA_MATH));
#endif // CUDA_VERSION >= 11000
} else {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_DEFAULT_MATH));
VLOG(5) << "NOT use cudnn_tensor_op_math";
}
#endif
}
template <typename PerfT>
struct SearchAlgorithmBase {};
// cuDNN convolution forward algorithm searcher, consisting of three search
// modes: deterministic, heuristic and exhaustive_search mode, as well as
// one workspace size acquisition function for the chosen algorithm.
template <>
struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
using PerfT = cudnnConvolutionFwdAlgoPerf_t;
using AlgoT = cudnnConvolutionFwdAlgo_t;
constexpr static phi::autotune::AlgorithmType kAlgoType =
phi::autotune::AlgorithmType::kConvForward;
template <typename T>
static SearchResult<AlgoT> Find(const ConvArgs& args,
bool exhaustive_search,
bool deterministic,
const phi::GPUContext& ctx) {
SearchResult<AlgoT> result;
auto dtype = platform::CudnnDataType<T>::type;
SetConvMathType(ctx, dtype, args.cdesc);
if (deterministic) {
result = FindAlgoDeterministic(args);
} else {
// 1. Once the exhaustive-search FLAG is turned on, always use exhaustive search.
// 2. Once auto-tune is turned on, run heuristic search (the default) before
// the auto-tune process, and run exhaustive search during that process.
// 3. After the auto-tune process, run the cached algorithm if one is cached,
// and run default mode for the rest.
auto key = args.Convert2ConvCacheKey<T>();
auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward();
if (cache.Find(key)) {
auto t = cache.Get(key);
result.algo = static_cast<AlgoT>(t.algo);
result.workspace_size = t.workspace_size;
} else {
bool use_autotune =
phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
if (exhaustive_search || use_autotune) {
result = FindAlgoExhaustiveSearch<T>(args, ctx);
} else {
result = FindAlgoHeuristic(args, ctx);
}
phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
result.workspace_size);
cache.Set(key, node);
}
}
VLOG(3) << "[cuDNN Convolution] exhaustive_search=" << exhaustive_search
<< ", deterministic=" << deterministic
<< ", choose algo=" << result.algo
<< ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
return result;
}
static const std::string GetPerfName() { return "ConvForward"; }
static size_t GetWorkspaceSize(const ConvArgs& args,
cudnnConvolutionFwdAlgo_t algo) {
......@@ -240,7 +175,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
return workspace_size;
}
private:
protected:
static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
auto workspace_size = GetWorkspaceSize(args, static_cast<AlgoT>(1));
return SearchResult<AlgoT>(static_cast<AlgoT>(1), -1.0, workspace_size);
......@@ -272,6 +207,10 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000
VLOG(4) << GetPerfResultString<PerfT>("[Heuristic] FwdAlgo Perf result",
perf_results,
actual_perf_count,
workspace_size_limit);
// cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8
ChooseAlgoByWorkspace<PerfT, AlgoT>(
perf_results, workspace_size_limit, &result);
......@@ -388,53 +327,13 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
// As well as one workspace size acquisition function with
// respect to the chosen algorithm.
template <>
struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
using PerfT = cudnnConvolutionBwdDataAlgoPerf_t;
using AlgoT = cudnnConvolutionBwdDataAlgo_t;
constexpr static phi::autotune::AlgorithmType kAlgoType =
phi::autotune::AlgorithmType::kConvBackwardData;
template <typename T>
static SearchResult<AlgoT> Find(const ConvArgs& args,
bool exhaustive_search,
bool deterministic,
const phi::GPUContext& ctx) {
SearchResult<AlgoT> result;
auto dtype = platform::CudnnDataType<T>::type;
SetConvMathType(ctx, dtype, args.cdesc);
if (deterministic) {
result = FindAlgoDeterministic(args);
} else {
// 1. Once the exhaustive-search FLAG is turned on, always use exhaustive search.
// 2. Once auto-tune is turned on, run heuristic search (the default) before
// the auto-tune process, and run exhaustive search during that process.
// 3. After the auto-tune process, run the cached algorithm if one is cached,
// and run default mode for the rest.
auto key = args.Convert2ConvCacheKey<T>();
auto& cache =
phi::autotune::AutoTuneCache::Instance().GetConvBackwardData();
if (cache.Find(key)) {
auto t = cache.Get(key);
result.algo = static_cast<AlgoT>(t.algo);
result.workspace_size = t.workspace_size;
} else {
bool use_autotune =
phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
if (exhaustive_search || use_autotune) {
result = FindAlgoExhaustiveSearch<T>(args, ctx);
} else {
result = FindAlgoHeuristic(args, ctx);
}
phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
result.workspace_size);
cache.Set(key, node);
}
}
VLOG(3) << "[cuDNN Convolution] exhaustive_search=" << exhaustive_search
<< ", deterministic=" << deterministic
<< ", choose algo=" << result.algo
<< ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
return result;
}
static const std::string GetPerfName() { return "ConvBackwardData"; }
static size_t GetWorkspaceSize(const ConvArgs& args,
cudnnConvolutionBwdDataAlgo_t algo) {
......@@ -451,7 +350,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
return workspace_size;
}
private:
protected:
static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
auto workspace_size =
GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
......@@ -610,54 +509,13 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
// exhaustive_search mode. As well as one workspace size acquisition function
// with respect to the chosen algorithm.
template <>
struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t;
using AlgoT = cudnnConvolutionBwdFilterAlgo_t;
constexpr static phi::autotune::AlgorithmType kAlgoType =
phi::autotune::AlgorithmType::kConvBackwardFilter;
template <typename T>
static SearchResult<AlgoT> Find(const ConvArgs& args,
bool exhaustive_search,
bool deterministic,
const phi::GPUContext& ctx) {
platform::CUDAGraphCaptureModeGuard guard;
SearchResult<AlgoT> result;
auto dtype = platform::CudnnDataType<T>::type;
SetConvMathType(ctx, dtype, args.cdesc);
if (deterministic) {
result = FindAlgoDeterministic(args);
} else {
// 1. Once the exhaustive-search FLAG is turned on, always use exhaustive search.
// 2. Once auto-tune is turned on, run heuristic search (the default) before
// the auto-tune process, and run exhaustive search during that process.
// 3. After the auto-tune process, run the cached algorithm if one is cached,
// and run default mode for the rest.
auto key = args.Convert2ConvCacheKey<T>();
auto& cache =
phi::autotune::AutoTuneCache::Instance().GetConvBackwardFilter();
if (cache.Find(key)) {
auto t = cache.Get(key);
result.algo = static_cast<AlgoT>(t.algo);
result.workspace_size = t.workspace_size;
} else {
bool use_autotune =
phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
if (exhaustive_search || use_autotune) {
result = FindAlgoExhaustiveSearch<T>(args, ctx);
} else {
result = FindAlgoHeuristic(args, ctx);
}
phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
result.workspace_size);
cache.Set(key, node);
}
}
VLOG(3) << "[cuDNN Convolution] exhaustive_search=" << exhaustive_search
<< ", deterministic=" << deterministic
<< ", choose algo=" << result.algo
<< ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
return result;
}
static const std::string GetPerfName() { return "ConvBackwardFilter"; }
static size_t GetWorkspaceSize(const ConvArgs& args,
cudnnConvolutionBwdFilterAlgo_t algo) {
......@@ -675,7 +533,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
return workspace_size;
}
private:
protected:
static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
auto workspace_size =
GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
......@@ -892,5 +750,103 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
}
};
template <typename PerfT>
struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
using AlgoT = typename SearchAlgorithmBase<PerfT>::AlgoT;
template <typename T>
static SearchResult<AlgoT> Find(const phi::GPUContext& ctx,
const ConvArgs& args,
bool exhaustive_search,
bool deterministic,
bool enable_autotune = true) {
SearchResult<AlgoT> result;
bool use_autotune = false;
auto dtype = platform::CudnnDataType<T>::type;
SetConvMathType(ctx, dtype, args.cdesc);
if (deterministic) {
result = SearchAlgorithmBase<PerfT>::FindAlgoDeterministic(args);
} else {
// 1. Once the exhaustive-search FLAG is turned on, always use exhaustive search.
// 2. Once auto-tune is turned on, run heuristic search (the default) before
// the auto-tune process, and run exhaustive search during that process.
// Auto-tune is only enabled within the specified range.
// 3. After the auto-tune process, run the cached algorithm if one is cached,
// and run default mode for the rest.
auto key = args.Convert2ConvCacheKey<T>();
auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv(
SearchAlgorithmBase<PerfT>::kAlgoType);
bool find_in_cache = cache.Find(key);
if (find_in_cache) {
auto t = cache.Get(key);
result.algo = static_cast<AlgoT>(t.algo);
result.workspace_size = t.workspace_size;
result.exhaustive_search = t.exhaustive_search;
}
if (!result.exhaustive_search) {
bool need_update_cache = false;
// In conv2d_transpose, enable_autotune is set to false because some
// algorithms picked by the exhaustive search method produce wrong results.
use_autotune = enable_autotune &&
phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
if (exhaustive_search || use_autotune) {
// Once autotune is enabled, the autotuned result can overwrite the
// previous result found by the heuristic method in the cache.
result =
SearchAlgorithmBase<PerfT>::template FindAlgoExhaustiveSearch<T>(
args, ctx);
need_update_cache = true;
} else if (!find_in_cache) {
result = SearchAlgorithmBase<PerfT>::FindAlgoHeuristic(args, ctx);
need_update_cache = true;
}
if (need_update_cache) {
phi::autotune::ConvAutoTuneResult node(
static_cast<int64_t>(result.algo),
result.workspace_size,
exhaustive_search || use_autotune);
cache.Set(key, node);
}
}
}
VLOG(3) << "[cuDNN " << SearchAlgorithmBase<PerfT>::GetPerfName()
<< "] exhaustive_search=" << exhaustive_search
<< ", use_autotune=" << use_autotune
<< ", deterministic=" << deterministic
<< ", choose algo=" << result.algo
<< ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
return result;
}
static void SetConvMathType(const phi::GPUContext& ctx,
cudnnDataType_t dtype,
const platform::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_TENSOR_OP_MATH));
VLOG(5) << "Enable Tensor Core for FLOAT16";
#if CUDA_VERSION >= 11000
#if CUDNN_VERSION_MIN(8, 1, 0)
} else if (ctx.GetComputeCapability() >= 80 &&
dtype == CUDNN_DATA_BFLOAT16) {
VLOG(5) << "Enable Tensor Core for BFLOAT16";
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_TENSOR_OP_MATH));
#endif // CUDNN_VERSION_MIN(8, 1, 0)
} else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) {
VLOG(5) << "Disable TensorFloat-32 (Tensor Core) for FLOAT";
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_FMA_MATH));
#endif // CUDA_VERSION >= 11000
} else {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_DEFAULT_MATH));
}
#endif
}
};
} // namespace operators
} // namespace paddle
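The refactor in this file splits each cuDNN searcher into a per-algorithm SearchAlgorithmBase<PerfT> specialization (deterministic, heuristic and exhaustive primitives plus the workspace query) and one shared SearchAlgorithm<PerfT> that owns the cache lookup, autotune switching and logging. A minimal standalone sketch of that "specialized base plus shared derived template" pattern follows; FwdPerf, BwdDataPerf and the trivial FindAlgoHeuristic bodies are illustrative stand-ins, not the real cuDNN perf types.

#include <iostream>
#include <string>

// Per-algorithm specializations supply the search primitives.
template <typename PerfT>
struct SearchAlgorithmBase {};

struct FwdPerf {};
struct BwdDataPerf {};

template <>
struct SearchAlgorithmBase<FwdPerf> {
  static std::string GetPerfName() { return "ConvForward"; }
  static int FindAlgoHeuristic() { return 1; }
};

template <>
struct SearchAlgorithmBase<BwdDataPerf> {
  static std::string GetPerfName() { return "ConvBackwardData"; }
  static int FindAlgoHeuristic() { return 2; }
};

// The derived template holds the logic that used to be duplicated in every
// specialization (cache lookup, autotune switching, logging, ...).
template <typename PerfT>
struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
  static int Find() {
    int algo = SearchAlgorithmBase<PerfT>::FindAlgoHeuristic();
    std::cout << "[cuDNN " << SearchAlgorithmBase<PerfT>::GetPerfName()
              << "] choose algo=" << algo << "\n";
    return algo;
  }
};

int main() {
  SearchAlgorithm<FwdPerf>::Find();      // [cuDNN ConvForward] choose algo=1
  SearchAlgorithm<BwdDataPerf>::Find();  // [cuDNN ConvBackwardData] choose algo=2
  return 0;
}

This is why the three duplicated Find() bodies could be deleted above, while the per-algorithm GetWorkspaceSize/FindAlgo* helpers stay in their specializations.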
......@@ -56,6 +56,9 @@ static void RemovePaddingSlice(const phi::GPUContext& context,
out_t.device(place) = in_t.slice(offsets, extents);
}
template <typename PerfT>
struct SearchAlgorithm {};
template <>
struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
using perf_t = miopenConvAlgoPerf_t;
......
......@@ -56,12 +56,14 @@ struct hash<std::vector<T>> {
namespace phi {
namespace autotune {
struct DnnNode {
DnnNode() {}
explicit DnnNode(int64_t a, size_t size) : algo(a), workspace_size(size) {}
struct ConvAutoTuneResult {
ConvAutoTuneResult() {}
ConvAutoTuneResult(int64_t a, size_t size, bool search)
: algo(a), workspace_size(size), exhaustive_search(search) {}
int64_t algo;
size_t workspace_size = 0;
bool exhaustive_search = false;
};
template <typename... Args>
......@@ -73,40 +75,41 @@ size_t GetKey(Args&&... args) {
struct ConvCacheKey {
ConvCacheKey() {}
explicit ConvCacheKey(const std::vector<int64_t>& x_dims,
const std::vector<int64_t>& w_dims,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
phi::DataType dtype,
int groups,
int64_t data_layout)
: x_dims_(x_dims),
w_dims_(w_dims),
strides_(strides),
paddings_(paddings),
dilations_(dilations),
dtype_(dtype),
groups_(groups),
data_layout_(data_layout) {}
ConvCacheKey(const std::vector<int64_t>& arg_x_dims,
const std::vector<int64_t>& arg_w_dims,
const std::vector<int>& arg_strides,
const std::vector<int>& arg_paddings,
const std::vector<int>& arg_dilations,
phi::DataType arg_dtype,
int arg_groups,
int64_t arg_data_layout)
: x_dims(arg_x_dims),
w_dims(arg_w_dims),
strides(arg_strides),
paddings(arg_paddings),
dilations(arg_dilations),
dtype(arg_dtype),
groups(arg_groups),
data_layout(arg_data_layout) {}
size_t hash_value() const {
return GetKey(x_dims_,
w_dims_,
strides_,
paddings_,
dilations_,
static_cast<int64_t>(dtype_),
groups_,
data_layout_);
return GetKey(x_dims,
w_dims,
strides,
paddings,
dilations,
static_cast<int64_t>(dtype),
groups,
data_layout);
}
std::vector<int64_t> x_dims_;
std::vector<int64_t> w_dims_;
std::vector<int> strides_;
std::vector<int> paddings_;
std::vector<int> dilations_;
phi::DataType dtype_;
int groups_;
int64_t data_layout_;
std::vector<int64_t> x_dims;
std::vector<int64_t> w_dims;
std::vector<int> strides;
std::vector<int> paddings;
std::vector<int> dilations;
phi::DataType dtype;
int groups;
int64_t data_layout;
};
struct ConvCacheKeyHash {
......@@ -118,14 +121,14 @@ struct ConvCacheKeyHash {
struct ConvCacheKeyEqual {
size_t operator()(const ConvCacheKey& first,
const ConvCacheKey& second) const {
if (first.x_dims_ != second.x_dims_) return false;
if (first.w_dims_ != second.w_dims_) return false;
if (first.strides_ != second.strides_) return false;
if (first.paddings_ != second.paddings_) return false;
if (first.dilations_ != second.dilations_) return false;
if (first.dtype_ != second.dtype_) return false;
if (first.groups_ != second.groups_) return false;
if (first.data_layout_ != second.data_layout_) return false;
if (first.x_dims != second.x_dims) return false;
if (first.w_dims != second.w_dims) return false;
if (first.strides != second.strides) return false;
if (first.paddings != second.paddings) return false;
if (first.dilations != second.dilations) return false;
if (first.dtype != second.dtype) return false;
if (first.groups != second.groups) return false;
if (first.data_layout != second.data_layout) return false;
return true;
}
......@@ -135,7 +138,7 @@ class CudnnAlgorithmsCacheMap {
public:
CudnnAlgorithmsCacheMap() : cache_mutex_(new std::mutex()) { hash_.clear(); }
DnnNode Get(const ConvCacheKey& key) {
ConvAutoTuneResult Get(const ConvCacheKey& key) {
std::lock_guard<std::mutex> lock(*cache_mutex_);
PADDLE_ENFORCE_NE(
hash_.find(key),
......@@ -163,7 +166,7 @@ class CudnnAlgorithmsCacheMap {
cache_misses_ = 0;
}
void Set(const ConvCacheKey& key, DnnNode algo) {
void Set(const ConvCacheKey& key, ConvAutoTuneResult algo) {
std::lock_guard<std::mutex> lock(*cache_mutex_);
if (hash_.size() > static_cast<size_t>(FLAGS_search_cache_max_number)) {
hash_.clear();
......@@ -188,7 +191,10 @@ class CudnnAlgorithmsCacheMap {
int64_t Size() const { return hash_.size(); }
private:
std::unordered_map<ConvCacheKey, DnnNode, ConvCacheKeyHash, ConvCacheKeyEqual>
std::unordered_map<ConvCacheKey,
ConvAutoTuneResult,
ConvCacheKeyHash,
ConvCacheKeyEqual>
hash_;
std::shared_ptr<std::mutex> cache_mutex_;
......@@ -289,19 +295,8 @@ class AutoTuneCache {
return auto_tune_map_[static_cast<int64_t>(algo_type)];
}
CudnnAlgorithmsCacheMap& GetConvForward() {
return cudnn_auto_tune_map_[static_cast<int64_t>(
AlgorithmType::kConvForward)];
}
CudnnAlgorithmsCacheMap& GetConvBackwardData() {
return cudnn_auto_tune_map_[static_cast<int64_t>(
AlgorithmType::kConvBackwardData)];
}
CudnnAlgorithmsCacheMap& GetConvBackwardFilter() {
return cudnn_auto_tune_map_[static_cast<int64_t>(
AlgorithmType::kConvBackwardFilter)];
CudnnAlgorithmsCacheMap& GetConv(const AlgorithmType& algo_type) {
return cudnn_auto_tune_map_[static_cast<int64_t>(algo_type)];
}
AlgorithmsCacheMap& GetTranspose() { return Get(AlgorithmType::kTranspose); }
......
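For reference, the cache above is an unordered_map from ConvCacheKey to ConvAutoTuneResult with user-supplied hash and equality functors. Below is a compact standalone sketch of that map setup, using simplified Key/Result stand-ins with only two fields and a Boost-style hash combine in place of the variadic GetKey(); none of these names are Paddle's actual definitions.

#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <unordered_map>
#include <vector>

// Simplified stand-ins for ConvCacheKey / ConvAutoTuneResult.
struct Key {
  std::vector<int64_t> x_dims;
  int groups = 1;
};

struct Result {
  int64_t algo = 0;
  bool exhaustive_search = false;
};

struct KeyHash {
  size_t operator()(const Key& k) const {
    size_t seed = std::hash<int>()(k.groups);
    for (int64_t d : k.x_dims) {
      // Boost-style hash combine; the real cache hashes all fields via GetKey().
      seed ^= std::hash<int64_t>()(d) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    }
    return seed;
  }
};

struct KeyEqual {
  bool operator()(const Key& a, const Key& b) const {
    return a.x_dims == b.x_dims && a.groups == b.groups;
  }
};

int main() {
  std::unordered_map<Key, Result, KeyHash, KeyEqual> cache;
  Key key{{4, 224, 224, 3}, /*groups=*/1};
  cache[key] = {/*algo=*/2, /*exhaustive=*/true};
  std::cout << "cached algo=" << cache[key].algo << "\n";  // 2
  return 0;
}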
......@@ -25,7 +25,8 @@ enum ConvAlgos { GEMMKernel = 0, CuDNNKernel_1 = 1, CuDNNKernel_2 = 2 };
TEST(AlgosCache, AlgosCache) {
auto autotune_cache = phi::autotune::AutoTuneCache::Instance();
auto& cache = autotune_cache.GetConvForward();
auto& cache =
autotune_cache.GetConv(phi::autotune::AlgorithmType::kConvForward);
std::vector<int64_t> x_shape = {4, 224, 224, 3};
std::vector<int64_t> w_shape = {32, 3, 3, 3};
......@@ -37,7 +38,8 @@ TEST(AlgosCache, AlgosCache) {
phi::autotune::ConvCacheKey key(
x_shape, w_shape, paddings, strides, dilations, dtype, 0, 0);
EXPECT_EQ(cache.Find(key), false);
phi::autotune::DnnNode node(static_cast<int64_t>(ConvAlgos::GEMMKernel), 0);
phi::autotune::ConvAutoTuneResult node(
static_cast<int64_t>(ConvAlgos::GEMMKernel), 0, false);
cache.Set(key, node);
EXPECT_EQ(cache.Size(), 1);
EXPECT_EQ(cache.Find(key), true);
......@@ -48,8 +50,8 @@ TEST(AlgosCache, AlgosCache) {
phi::autotune::ConvCacheKey key1(
x_shape, w_shape, paddings, strides, dilations, dtype, 0, 1);
EXPECT_EQ(cache.Find(key1), false);
phi::autotune::DnnNode node1(static_cast<int64_t>(ConvAlgos::CuDNNKernel_1),
0);
phi::autotune::ConvAutoTuneResult node1(
static_cast<int64_t>(ConvAlgos::CuDNNKernel_1), 0, false);
cache.Set(key1, node1);
EXPECT_EQ(cache.Size(), 2);
EXPECT_EQ(cache.CacheHits(), 1);
......
......@@ -336,7 +336,7 @@ void ConvCudnnGradGradKernel(
#else
using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result1 = search1::Find<T>(args1, exhaustive_search, false, ctx);
fwd_result1 = search1::Find<T>(ctx, args1, exhaustive_search, false);
workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo);
#endif
}
......@@ -364,7 +364,7 @@ void ConvCudnnGradGradKernel(
#else
using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result2 = search2::Find<T>(args2, exhaustive_search, false, ctx);
fwd_result2 = search2::Find<T>(ctx, args2, exhaustive_search, false);
workspace_size = std::max(
workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo));
#endif
......@@ -394,7 +394,7 @@ void ConvCudnnGradGradKernel(
using search3 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_result =
search3::Find<T>(args3, exhaustive_search, deterministic, ctx);
search3::Find<T>(ctx, args3, exhaustive_search, deterministic);
workspace_size = std::max(
workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo));
#endif
......@@ -424,7 +424,7 @@ void ConvCudnnGradGradKernel(
using search4 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
data_result =
search4::Find<T>(args4, exhaustive_search, deterministic, ctx);
search4::Find<T>(ctx, args4, exhaustive_search, deterministic);
workspace_size = std::max(
workspace_size, search4::GetWorkspaceSize(args4, data_result.algo));
#endif
......
......@@ -373,7 +373,7 @@ void ConvCudnnGradKernel(const Context& ctx,
#else
using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_result = search1::Find<T>(args1, exhaustive_search, deterministic, ctx);
bwd_result = search1::Find<T>(ctx, args1, exhaustive_search, deterministic);
workspace_size_d = std::max(workspace_size_d, bwd_result.workspace_size);
#endif
}
......@@ -402,7 +402,7 @@ void ConvCudnnGradKernel(const Context& ctx,
using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_result =
search2::Find<T>(args2, exhaustive_search, deterministic, ctx);
search2::Find<T>(ctx, args2, exhaustive_search, deterministic);
VLOG(3) << "filter algo: " << filter_result.algo << ", time "
<< filter_result.time;
workspace_size_w = std::max(workspace_size_w, filter_result.workspace_size);
......
......@@ -25,7 +25,6 @@
#endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
......@@ -56,8 +55,7 @@ void ConvCudnnKernel(const Context& ctx,
bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t;
bool deterministic = FLAGS_cudnn_deterministic;
auto exhaustive_deterministic = exhaustive_search && deterministic;
PADDLE_ENFORCE_EQ(exhaustive_deterministic,
PADDLE_ENFORCE_EQ(exhaustive_search && deterministic,
false,
phi::errors::InvalidArgument(
"Can't set exhaustive_search True and "
......@@ -315,7 +313,7 @@ void ConvCudnnKernel(const Context& ctx,
paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
using search =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result = search::Find<T>(args, exhaustive_search, deterministic, ctx);
fwd_result = search::Find<T>(ctx, args, exhaustive_search, deterministic);
workspace_size = fwd_result.workspace_size;
#endif
......
......@@ -230,7 +230,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
#else
using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result = search1::Find<T>(args1, false, deterministic, ctx);
fwd_result = search1::Find<T>(ctx, args1, false, deterministic, false);
workspace_size = std::max(
workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo));
#endif
......@@ -257,7 +257,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
#else
using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_result = search2::Find<T>(args2, false, deterministic, ctx);
filter_result = search2::Find<T>(ctx, args2, false, deterministic, false);
workspace_size = std::max(
workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo));
#endif
......@@ -710,7 +710,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
#else
using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_result1 = search1::Find<T>(args1, false, deterministic, ctx);
bwd_result1 = search1::Find<T>(ctx, args1, false, deterministic, false);
workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo);
#endif
......@@ -734,7 +734,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
#else
using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_result2 = search2::Find<T>(args2, false, deterministic, ctx);
bwd_result2 = search2::Find<T>(ctx, args2, false, deterministic, false);
workspace_size = std::max(
workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo));
#endif
......@@ -761,7 +761,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
#else
using search3 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_result = search3::Find<T>(args3, false, deterministic, ctx);
filter_result = search3::Find<T>(ctx, args3, false, deterministic, false);
workspace_size = std::max(
workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo));
#endif
......@@ -789,7 +789,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
#else
using search4 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result = search4::Find<T>(args4, false, deterministic, ctx);
fwd_result = search4::Find<T>(ctx, args4, false, deterministic, false);
workspace_size = std::max(
workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo));
#endif
......
......@@ -230,7 +230,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result;
using search =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_result = search::Find<T>(args, false, deterministic, ctx);
bwd_result = search::Find<T>(ctx, args, false, deterministic, false);
workspace_size =
std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo));
#endif
......
......@@ -72,15 +72,20 def check_speed_result(case_name, develop_data, pr_data, pr_result):
    """
    pr_gpu_time = pr_data.get("gpu_time")
    develop_gpu_time = develop_data.get("gpu_time")
    gpu_time_diff = (pr_gpu_time - develop_gpu_time) / develop_gpu_time
    if develop_gpu_time != 0.0:
        gpu_time_diff = (pr_gpu_time - develop_gpu_time) / develop_gpu_time
        gpu_time_diff_str = "{:.5f}".format(gpu_time_diff * 100)
    else:
        gpu_time_diff = None
        gpu_time_diff_str = ""
    pr_total_time = pr_data.get("total")
    develop_total_time = develop_data.get("total")
    total_time_diff = (pr_total_time - develop_total_time) / develop_total_time
    logging.info("------ OP: %s ------" % case_name)
    logging.info("GPU time change: %.5f%% (develop: %.7f -> PR: %.7f)" %
                 (gpu_time_diff * 100, develop_gpu_time, pr_gpu_time))
    logging.info("GPU time change: %s (develop: %.7f -> PR: %.7f)" %
                 (gpu_time_diff_str, develop_gpu_time, pr_gpu_time))
    logging.info("Total time change: %.5f%% (develop: %.7f -> PR: %.7f)" %
                 (total_time_diff * 100, develop_total_time, pr_total_time))
    logging.info("backward: %s" % pr_result.get("backward"))
......@@ -196,7 +201,8 @@ if __name__ == "__main__":
        args.develop_logs_dir)
    check_path_exists(args.pr_logs_dir)
    for log_file in os.listdir(args.pr_logs_dir):
    pr_log_files = os.listdir(args.pr_logs_dir)
    for log_file in sorted(pr_log_files):
        develop_result = develop_result_dict.get(log_file)
        pr_result = parse_log_file(os.path.join(args.pr_logs_dir, log_file))
        if develop_result is None or pr_result is None:
......