未验证 提交 c0ed8729 编写于 作者: Y Yiqun Liu 提交者: GitHub

[Cherry-pick] Simplify conv codes and fix cache and autotune bugs. (#47197)

* Simplify the codes of conv. (#45966)

* Enable to record whether the conv algo is got by exhaustive search to fix autotune cache bug. (#47065)
上级 50d4fa54
...@@ -36,17 +36,10 @@ using framework::ConvSearchCache; ...@@ -36,17 +36,10 @@ using framework::ConvSearchCache;
template <typename T> template <typename T>
using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType; using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
// As the basic for SearchAlgorithm struct.
template <typename PerfT>
struct SearchAlgorithm {};
// As the container of searchAlgorithm::Find() result. // As the container of searchAlgorithm::Find() result.
template <typename AlgoT> template <typename AlgoT>
struct SearchResult { struct SearchResult {
SearchResult() {} SearchResult() {}
explicit SearchResult(const phi::autotune::DnnNode& node)
: algo(static_cast<AlgoT>(node.algo)),
workspace_size(node.workspace_size) {}
explicit SearchResult(AlgoT a) : algo(a) {} explicit SearchResult(AlgoT a) : algo(a) {}
explicit SearchResult(AlgoT a, float t, size_t size) explicit SearchResult(AlgoT a, float t, size_t size)
...@@ -55,12 +48,21 @@ struct SearchResult { ...@@ -55,12 +48,21 @@ struct SearchResult {
AlgoT algo = static_cast<AlgoT>(0); AlgoT algo = static_cast<AlgoT>(0);
float time = -1.f; float time = -1.f;
size_t workspace_size = 0; size_t workspace_size = 0;
bool exhaustive_search = false;
}; };
template <typename T> template <typename T>
static std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) { static std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
out << "["; out << "[";
for (auto const& tmp : v) out << tmp << ","; bool is_first = true;
for (auto const& tmp : v) {
if (is_first) {
out << tmp;
is_first = false;
} else {
out << ", " << tmp;
}
}
out << "]"; out << "]";
return out; return out;
} }
...@@ -113,7 +115,7 @@ struct ConvArgsBase { ...@@ -113,7 +115,7 @@ struct ConvArgsBase {
auto w_shape = phi::vectorize(w->dims()); auto w_shape = phi::vectorize(w->dims());
VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape
<< ", strides=" << s << ", paddings=" << p << ", dilations=" << d << ", strides=" << s << ", paddings=" << p << ", dilations=" << d
<< ",data= " << paddle::experimental::CppTypeToDataType<T>::Type() << ", data=" << paddle::experimental::CppTypeToDataType<T>::Type()
<< ", group=" << group << ", group=" << group
<< ", data layout=" << static_cast<int64_t>(data_layout); << ", data layout=" << static_cast<int64_t>(data_layout);
......
...@@ -14,12 +14,11 @@ limitations under the License. */ ...@@ -14,12 +14,11 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/conv_base_helper.h" #include "paddle/fluid/operators/conv_base_helper.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h" #include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
namespace paddle { namespace paddle {
...@@ -53,11 +52,9 @@ static void RemovePaddingSlice(const phi::GPUContext& context, ...@@ -53,11 +52,9 @@ static void RemovePaddingSlice(const phi::GPUContext& context,
} }
auto in_t = auto in_t =
framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From( phi::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(*input);
*input); auto out_t = phi::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
auto out_t = *out, new_out_dims);
framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
*out, new_out_dims);
phi::funcs::EigenSlice<std::decay_t<decltype(place)>, T, D>::Eval( phi::funcs::EigenSlice<std::decay_t<decltype(place)>, T, D>::Eval(
place, out_t, in_t, offsets, extents); place, out_t, in_t, offsets, extents);
...@@ -147,83 +144,21 @@ void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results, ...@@ -147,83 +144,21 @@ void ChooseAlgoByWorkspace(const std::vector<PerfT>& perf_results,
} }
} }
static void SetConvMathType(const phi::GPUContext& ctx, template <typename PerfT>
cudnnDataType_t dtype, struct SearchAlgorithmBase {};
const platform::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_TENSOR_OP_MATH));
VLOG(5) << "use cudnn_tensor_op_math";
#if CUDA_VERSION >= 11000
#if CUDNN_VERSION_MIN(8, 1, 0)
} else if (ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_TENSOR_OP_MATH));
#endif // CUDNN_VERSION_MIN(8, 1, 0)
} else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_FMA_MATH));
#endif // CUDA_VERSION >= 11000
} else {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_DEFAULT_MATH));
VLOG(5) << "NOT use cudnn_tensor_op_math";
}
#endif
}
// cuDNN convolution forward algorithm searcher, consisting of three searching // cuDNN convolution forward algorithm searcher, consisting of three searching
// modes, namely: deterministic, heuristic and exhaustive_search mode. // modes, namely: deterministic, heuristic and exhaustive_search mode.
// As well as one workspace size acquisition function with respect to // As well as one workspace size acquisition function with respect to
// the chosen algorithm. // the chosen algorithm.
template <> template <>
struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> { struct SearchAlgorithmBase<cudnnConvolutionFwdAlgoPerf_t> {
using PerfT = cudnnConvolutionFwdAlgoPerf_t; using PerfT = cudnnConvolutionFwdAlgoPerf_t;
using AlgoT = cudnnConvolutionFwdAlgo_t; using AlgoT = cudnnConvolutionFwdAlgo_t;
constexpr static phi::autotune::AlgorithmType kAlgoType =
phi::autotune::AlgorithmType::kConvForward;
template <typename T> static const std::string GetPerfName() { return "ConvForward"; }
static SearchResult<AlgoT> Find(const ConvArgs& args,
bool exhaustive_search,
bool deterministic,
const phi::GPUContext& ctx) {
SearchResult<AlgoT> result;
auto dtype = platform::CudnnDataType<T>::type;
SetConvMathType(ctx, dtype, args.cdesc);
if (deterministic) {
result = FindAlgoDeterministic(args);
} else {
// 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
// 2. Once turning on auto-tune, runn heuristic search(default) before
// auto-tune process, run exhaustive_search during mentioned process.
// 3. After auto-tune process, run cached algorithm if cached, run
// default mode for the rest.
auto key = args.Convert2ConvCacheKey<T>();
auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward();
if (cache.Find(key)) {
auto t = cache.Get(key);
result.algo = static_cast<AlgoT>(t.algo);
result.workspace_size = t.workspace_size;
} else {
bool use_autotune =
phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
if (exhaustive_search || use_autotune) {
result = FindAlgoExhaustiveSearch<T>(args, ctx);
} else {
result = FindAlgoHeuristic(args, ctx);
}
phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
result.workspace_size);
cache.Set(key, node);
}
}
VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
<< ", deterministic=" << deterministic
<< ", choose algo=" << result.algo
<< ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
return result;
}
static size_t GetWorkspaceSize(const ConvArgs& args, static size_t GetWorkspaceSize(const ConvArgs& args,
cudnnConvolutionFwdAlgo_t algo) { cudnnConvolutionFwdAlgo_t algo) {
...@@ -240,7 +175,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> { ...@@ -240,7 +175,7 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
return workspace_size; return workspace_size;
} }
private: protected:
static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) { static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
auto workspace_size = GetWorkspaceSize(args, static_cast<AlgoT>(1)); auto workspace_size = GetWorkspaceSize(args, static_cast<AlgoT>(1));
return SearchResult<AlgoT>(static_cast<AlgoT>(1), -1.0, workspace_size); return SearchResult<AlgoT>(static_cast<AlgoT>(1), -1.0, workspace_size);
...@@ -272,6 +207,10 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> { ...@@ -272,6 +207,10 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
if (result.workspace_size > workspace_size_limit) { if (result.workspace_size > workspace_size_limit) {
#if CUDNN_VERSION >= 8000 #if CUDNN_VERSION >= 8000
VLOG(4) << GetPerfResultString<PerfT>("[Heuristic] FwdAlgo Perf result",
perf_results,
actual_perf_count,
workspace_size_limit);
// cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8 // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8
ChooseAlgoByWorkspace<PerfT, AlgoT>( ChooseAlgoByWorkspace<PerfT, AlgoT>(
perf_results, workspace_size_limit, &result); perf_results, workspace_size_limit, &result);
...@@ -388,53 +327,13 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> { ...@@ -388,53 +327,13 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
// As well as one workspace size acquisition function with // As well as one workspace size acquisition function with
// respect to the chosen algorithm. // respect to the chosen algorithm.
template <> template <>
struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> { struct SearchAlgorithmBase<cudnnConvolutionBwdDataAlgoPerf_t> {
using PerfT = cudnnConvolutionBwdDataAlgoPerf_t; using PerfT = cudnnConvolutionBwdDataAlgoPerf_t;
using AlgoT = cudnnConvolutionBwdDataAlgo_t; using AlgoT = cudnnConvolutionBwdDataAlgo_t;
constexpr static phi::autotune::AlgorithmType kAlgoType =
phi::autotune::AlgorithmType::kConvBackwardData;
template <typename T> static const std::string GetPerfName() { return "ConvBackwardData"; }
static SearchResult<AlgoT> Find(const ConvArgs& args,
bool exhaustive_search,
bool deterministic,
const phi::GPUContext& ctx) {
SearchResult<AlgoT> result;
auto dtype = platform::CudnnDataType<T>::type;
SetConvMathType(ctx, dtype, args.cdesc);
if (deterministic) {
result = FindAlgoDeterministic(args);
} else {
// 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
// 2. Once turning on auto-tune, runn heuristic search(default) before
// auto-tune process, run exhaustive_search during mentioned process.
// 3. After auto-tune process, run cached algorithm if cached, run
// default mode for the rest.
auto key = args.Convert2ConvCacheKey<T>();
auto& cache =
phi::autotune::AutoTuneCache::Instance().GetConvBackwardData();
if (cache.Find(key)) {
auto t = cache.Get(key);
result.algo = static_cast<AlgoT>(t.algo);
result.workspace_size = t.workspace_size;
} else {
bool use_autotune =
phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
if (exhaustive_search || use_autotune) {
result = FindAlgoExhaustiveSearch<T>(args, ctx);
} else {
result = FindAlgoHeuristic(args, ctx);
}
phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
result.workspace_size);
cache.Set(key, node);
}
}
VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
<< ", deterministic=" << deterministic
<< ", choose algo=" << result.algo
<< ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
return result;
}
static size_t GetWorkspaceSize(const ConvArgs& args, static size_t GetWorkspaceSize(const ConvArgs& args,
cudnnConvolutionBwdDataAlgo_t algo) { cudnnConvolutionBwdDataAlgo_t algo) {
...@@ -451,7 +350,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> { ...@@ -451,7 +350,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
return workspace_size; return workspace_size;
} }
private: protected:
static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) { static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
auto workspace_size = auto workspace_size =
GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1); GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1);
...@@ -610,54 +509,13 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> { ...@@ -610,54 +509,13 @@ struct SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t> {
// exhaustive_search mode. As well as one workspace size acquisition function // exhaustive_search mode. As well as one workspace size acquisition function
// with respect to the chosen algorithm. // with respect to the chosen algorithm.
template <> template <>
struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> { struct SearchAlgorithmBase<cudnnConvolutionBwdFilterAlgoPerf_t> {
using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t; using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t;
using AlgoT = cudnnConvolutionBwdFilterAlgo_t; using AlgoT = cudnnConvolutionBwdFilterAlgo_t;
constexpr static phi::autotune::AlgorithmType kAlgoType =
phi::autotune::AlgorithmType::kConvBackwardFilter;
template <typename T> static const std::string GetPerfName() { return "ConvBackwardFilter"; }
static SearchResult<AlgoT> Find(const ConvArgs& args,
bool exhaustive_search,
bool deterministic,
const phi::GPUContext& ctx) {
platform::CUDAGraphCaptureModeGuard guard;
SearchResult<AlgoT> result;
auto dtype = platform::CudnnDataType<T>::type;
SetConvMathType(ctx, dtype, args.cdesc);
if (deterministic) {
result = FindAlgoDeterministic(args);
} else {
// 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
// 2. Once turning on auto-tune, runn heuristic search(default) before
// auto-tune process, run exhaustive_search during mentioned process.
// 3. After auto-tune process, run cached algorithm if cached, run
// default mode for the rest.
auto key = args.Convert2ConvCacheKey<T>();
auto& cache =
phi::autotune::AutoTuneCache::Instance().GetConvBackwardFilter();
if (cache.Find(key)) {
auto t = cache.Get(key);
result.algo = static_cast<AlgoT>(t.algo);
result.workspace_size = t.workspace_size;
} else {
bool use_autotune =
phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
if (exhaustive_search || use_autotune) {
result = FindAlgoExhaustiveSearch<T>(args, ctx);
} else {
result = FindAlgoHeuristic(args, ctx);
}
phi::autotune::DnnNode node(static_cast<int64_t>(result.algo),
result.workspace_size);
cache.Set(key, node);
}
}
VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search
<< ", deterministic=" << deterministic
<< ", choose algo=" << result.algo
<< ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
return result;
}
static size_t GetWorkspaceSize(const ConvArgs& args, static size_t GetWorkspaceSize(const ConvArgs& args,
cudnnConvolutionBwdFilterAlgo_t algo) { cudnnConvolutionBwdFilterAlgo_t algo) {
...@@ -675,7 +533,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> { ...@@ -675,7 +533,7 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
return workspace_size; return workspace_size;
} }
private: protected:
static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) { static SearchResult<AlgoT> FindAlgoDeterministic(const ConvArgs& args) {
auto workspace_size = auto workspace_size =
GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1); GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1);
...@@ -892,5 +750,103 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> { ...@@ -892,5 +750,103 @@ struct SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t> {
} }
}; };
// Front-end of the cuDNN convolution algorithm search. It derives from
// SearchAlgorithmBase<PerfT>, which supplies AlgoT, kAlgoType, GetPerfName()
// and the FindAlgoDeterministic / FindAlgoHeuristic / FindAlgoExhaustiveSearch
// helpers, and adds the shared selection-and-caching logic in Find().
template <typename PerfT>
struct SearchAlgorithm : public SearchAlgorithmBase<PerfT> {
using AlgoT = typename SearchAlgorithmBase<PerfT>::AlgoT;
// Chooses a convolution algorithm for `args` and returns it together with
// its workspace size.
//
// ctx:               GPU context; forwarded to SetConvMathType and to the
//                    heuristic / exhaustive search helpers.
// args:              convolution arguments; also the source of the cache key.
// exhaustive_search: request an exhaustive search for this call.
// deterministic:     when true, FindAlgoDeterministic is used and the
//                    autotune cache is neither read nor written.
// enable_autotune:   when false, the global autotune status is ignored.
template <typename T>
static SearchResult<AlgoT> Find(const phi::GPUContext& ctx,
const ConvArgs& args,
bool exhaustive_search,
bool deterministic,
bool enable_autotune = true) {
SearchResult<AlgoT> result;
bool use_autotune = false;
auto dtype = platform::CudnnDataType<T>::type;
// The math type is configured on the convolution descriptor before any
// search is performed.
SetConvMathType(ctx, dtype, args.cdesc);
if (deterministic) {
result = SearchAlgorithmBase<PerfT>::FindAlgoDeterministic(args);
} else {
// 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
// 2. Once turning on auto-tune, run heuristic (default) before
// auto-tune process, run exhaustive_search during mentioned process.
// Auto tune is only enabled between specified range.
// 3. After auto-tune process, run cached algorithm if cached, run
// default mode for the rest.
auto key = args.Convert2ConvCacheKey<T>();
auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv(
SearchAlgorithmBase<PerfT>::kAlgoType);
bool find_in_cache = cache.Find(key);
if (find_in_cache) {
// Reuse the cached choice, including the flag that records whether it
// was produced by an exhaustive search.
auto t = cache.Get(key);
result.algo = static_cast<AlgoT>(t.algo);
result.workspace_size = t.workspace_size;
result.exhaustive_search = t.exhaustive_search;
}
// A cached result that came from an exhaustive search is final. Otherwise
// (no cache entry, or an entry not marked as exhaustive) a search may
// still run below.
if (!result.exhaustive_search) {
bool need_update_cache = false;
// In conv2d_transpose, enable_autotune is set to false because some
// algorithm picked by exhaustive search method produce wrong result.
use_autotune = enable_autotune &&
phi::autotune::AutoTuneStatus::Instance().UseAutoTune();
if (exhaustive_search || use_autotune) {
// Once autotune is enabled, the autotuned result can rewrite the
// previous result in cache found by heuristic method.
// NOTE(review): result.exhaustive_search is only assigned from a cache
// hit in this function; whether FindAlgoExhaustiveSearch sets it on the
// returned value is not visible here -- confirm if callers rely on it.
result =
SearchAlgorithmBase<PerfT>::template FindAlgoExhaustiveSearch<T>(
args, ctx);
need_update_cache = true;
} else if (!find_in_cache) {
// Nothing cached and no exhaustive search requested: fall back to the
// heuristic search and cache its answer.
result = SearchAlgorithmBase<PerfT>::FindAlgoHeuristic(args, ctx);
need_update_cache = true;
}
if (need_update_cache) {
// The third argument marks the entry as exhaustive exactly when the
// exhaustive branch above was taken.
phi::autotune::ConvAutoTuneResult node(
static_cast<int64_t>(result.algo),
result.workspace_size,
exhaustive_search || use_autotune);
cache.Set(key, node);
}
}
}
VLOG(3) << "[cuDNN " << SearchAlgorithmBase<PerfT>::GetPerfName()
<< "] exhaustive_search=" << exhaustive_search
<< ", use_autotune=" << use_autotune
<< ", deterministic=" << deterministic
<< ", choose algo=" << result.algo
<< ", workspace=" << ToMegaBytes(result.workspace_size) << " MB";
return result;
}
// Sets the cuDNN math type on `cdesc` according to `dtype` and the compute
// capability reported by `ctx`:
//   - CUDNN_DATA_HALF with compute capability >= 70: CUDNN_TENSOR_OP_MATH
//   - CUDNN_DATA_BFLOAT16 with compute capability >= 80 (compiled only for
//     CUDA >= 11000 and cuDNN >= 8.1.0): CUDNN_TENSOR_OP_MATH
//   - CUDNN_DATA_FLOAT with cdesc.allow_tf32_ false (compiled only for
//     CUDA >= 11000): CUDNN_FMA_MATH
//   - anything else: CUDNN_DEFAULT_MATH
// The whole body is compiled only when CUDA_VERSION >= 9000 and cuDNN is at
// least 7.0.1; otherwise the function does nothing.
static void SetConvMathType(const phi::GPUContext& ctx,
cudnnDataType_t dtype,
const platform::ConvolutionDescriptor& cdesc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_TENSOR_OP_MATH));
VLOG(5) << "Enable Tensor Core for FLOAT16";
#if CUDA_VERSION >= 11000
#if CUDNN_VERSION_MIN(8, 1, 0)
} else if (ctx.GetComputeCapability() >= 80 &&
dtype == CUDNN_DATA_BFLOAT16) {
VLOG(5) << "Enable Tensor Core for BFLOAT16";
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_TENSOR_OP_MATH));
#endif  // CUDNN_VERSION_MIN(8, 1, 0)
} else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) {
VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT";
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_FMA_MATH));
#endif  // CUDA_VERSION >= 11000
} else {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType(
cdesc.desc(), CUDNN_DEFAULT_MATH));
}
#endif
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -56,6 +56,9 @@ static void RemovePaddingSlice(const phi::GPUContext& context, ...@@ -56,6 +56,9 @@ static void RemovePaddingSlice(const phi::GPUContext& context,
out_t.device(place) = in_t.slice(offsets, extents); out_t.device(place) = in_t.slice(offsets, extents);
} }
// Empty primary template of the algorithm searcher; explicit specializations
// (such as the one for miopenConvFwdAlgorithm_t that follows) provide the
// actual members.
template <typename PerfT>
struct SearchAlgorithm {};
template <> template <>
struct SearchAlgorithm<miopenConvFwdAlgorithm_t> { struct SearchAlgorithm<miopenConvFwdAlgorithm_t> {
using perf_t = miopenConvAlgoPerf_t; using perf_t = miopenConvAlgoPerf_t;
......
...@@ -56,12 +56,14 @@ struct hash<std::vector<T>> { ...@@ -56,12 +56,14 @@ struct hash<std::vector<T>> {
namespace phi { namespace phi {
namespace autotune { namespace autotune {
struct DnnNode { struct ConvAutoTuneResult {
DnnNode() {} ConvAutoTuneResult() {}
explicit DnnNode(int64_t a, size_t size) : algo(a), workspace_size(size) {} ConvAutoTuneResult(int64_t a, size_t size, bool search)
: algo(a), workspace_size(size), exhaustive_search(search) {}
int64_t algo; int64_t algo;
size_t workspace_size = 0; size_t workspace_size = 0;
bool exhaustive_search = false;
}; };
template <typename... Args> template <typename... Args>
...@@ -73,40 +75,41 @@ size_t GetKey(Args&&... args) { ...@@ -73,40 +75,41 @@ size_t GetKey(Args&&... args) {
struct ConvCacheKey { struct ConvCacheKey {
ConvCacheKey() {} ConvCacheKey() {}
explicit ConvCacheKey(const std::vector<int64_t>& x_dims, ConvCacheKey(const std::vector<int64_t>& arg_x_dims,
const std::vector<int64_t>& w_dims, const std::vector<int64_t>& arg_w_dims,
const std::vector<int>& strides, const std::vector<int>& arg_strides,
const std::vector<int>& paddings, const std::vector<int>& arg_paddings,
const std::vector<int>& dilations, const std::vector<int>& arg_dilations,
phi::DataType dtype, phi::DataType arg_dtype,
int groups, int arg_groups,
int64_t data_layout) int64_t arg_data_layout)
: x_dims_(x_dims), : x_dims(arg_x_dims),
w_dims_(w_dims), w_dims(arg_w_dims),
strides_(strides), strides(arg_strides),
paddings_(paddings), paddings(arg_paddings),
dilations_(dilations), dilations(arg_dilations),
dtype_(dtype), dtype(arg_dtype),
groups_(groups), groups(arg_groups),
data_layout_(data_layout) {} data_layout(arg_data_layout) {}
size_t hash_value() const { size_t hash_value() const {
return GetKey(x_dims_, return GetKey(x_dims,
w_dims_, w_dims,
strides_, strides,
paddings_, paddings,
dilations_, dilations,
static_cast<int64_t>(dtype_), static_cast<int64_t>(dtype),
groups_, groups,
data_layout_); data_layout);
} }
std::vector<int64_t> x_dims_;
std::vector<int64_t> w_dims_; std::vector<int64_t> x_dims;
std::vector<int> strides_; std::vector<int64_t> w_dims;
std::vector<int> paddings_; std::vector<int> strides;
std::vector<int> dilations_; std::vector<int> paddings;
phi::DataType dtype_; std::vector<int> dilations;
int groups_; phi::DataType dtype;
int64_t data_layout_; int groups;
int64_t data_layout;
}; };
struct ConvCacheKeyHash { struct ConvCacheKeyHash {
...@@ -118,14 +121,14 @@ struct ConvCacheKeyHash { ...@@ -118,14 +121,14 @@ struct ConvCacheKeyHash {
struct ConvCacheKeyEqual { struct ConvCacheKeyEqual {
size_t operator()(const ConvCacheKey& first, size_t operator()(const ConvCacheKey& first,
const ConvCacheKey& second) const { const ConvCacheKey& second) const {
if (first.x_dims_ != second.x_dims_) return false; if (first.x_dims != second.x_dims) return false;
if (first.w_dims_ != second.w_dims_) return false; if (first.w_dims != second.w_dims) return false;
if (first.strides_ != second.strides_) return false; if (first.strides != second.strides) return false;
if (first.paddings_ != second.paddings_) return false; if (first.paddings != second.paddings) return false;
if (first.dilations_ != second.dilations_) return false; if (first.dilations != second.dilations) return false;
if (first.dtype_ != second.dtype_) return false; if (first.dtype != second.dtype) return false;
if (first.groups_ != second.groups_) return false; if (first.groups != second.groups) return false;
if (first.data_layout_ != second.data_layout_) return false; if (first.data_layout != second.data_layout) return false;
return true; return true;
} }
...@@ -135,7 +138,7 @@ class CudnnAlgorithmsCacheMap { ...@@ -135,7 +138,7 @@ class CudnnAlgorithmsCacheMap {
public: public:
CudnnAlgorithmsCacheMap() : cache_mutex_(new std::mutex()) { hash_.clear(); } CudnnAlgorithmsCacheMap() : cache_mutex_(new std::mutex()) { hash_.clear(); }
DnnNode Get(const ConvCacheKey& key) { ConvAutoTuneResult Get(const ConvCacheKey& key) {
std::lock_guard<std::mutex> lock(*cache_mutex_); std::lock_guard<std::mutex> lock(*cache_mutex_);
PADDLE_ENFORCE_NE( PADDLE_ENFORCE_NE(
hash_.find(key), hash_.find(key),
...@@ -163,7 +166,7 @@ class CudnnAlgorithmsCacheMap { ...@@ -163,7 +166,7 @@ class CudnnAlgorithmsCacheMap {
cache_misses_ = 0; cache_misses_ = 0;
} }
void Set(const ConvCacheKey& key, DnnNode algo) { void Set(const ConvCacheKey& key, ConvAutoTuneResult algo) {
std::lock_guard<std::mutex> lock(*cache_mutex_); std::lock_guard<std::mutex> lock(*cache_mutex_);
if (hash_.size() > static_cast<size_t>(FLAGS_search_cache_max_number)) { if (hash_.size() > static_cast<size_t>(FLAGS_search_cache_max_number)) {
hash_.clear(); hash_.clear();
...@@ -188,7 +191,10 @@ class CudnnAlgorithmsCacheMap { ...@@ -188,7 +191,10 @@ class CudnnAlgorithmsCacheMap {
int64_t Size() const { return hash_.size(); } int64_t Size() const { return hash_.size(); }
private: private:
std::unordered_map<ConvCacheKey, DnnNode, ConvCacheKeyHash, ConvCacheKeyEqual> std::unordered_map<ConvCacheKey,
ConvAutoTuneResult,
ConvCacheKeyHash,
ConvCacheKeyEqual>
hash_; hash_;
std::shared_ptr<std::mutex> cache_mutex_; std::shared_ptr<std::mutex> cache_mutex_;
...@@ -289,19 +295,8 @@ class AutoTuneCache { ...@@ -289,19 +295,8 @@ class AutoTuneCache {
return auto_tune_map_[static_cast<int64_t>(algo_type)]; return auto_tune_map_[static_cast<int64_t>(algo_type)];
} }
CudnnAlgorithmsCacheMap& GetConvForward() { CudnnAlgorithmsCacheMap& GetConv(const AlgorithmType& algo_type) {
return cudnn_auto_tune_map_[static_cast<int64_t>( return cudnn_auto_tune_map_[static_cast<int64_t>(algo_type)];
AlgorithmType::kConvForward)];
}
CudnnAlgorithmsCacheMap& GetConvBackwardData() {
return cudnn_auto_tune_map_[static_cast<int64_t>(
AlgorithmType::kConvBackwardData)];
}
CudnnAlgorithmsCacheMap& GetConvBackwardFilter() {
return cudnn_auto_tune_map_[static_cast<int64_t>(
AlgorithmType::kConvBackwardFilter)];
} }
AlgorithmsCacheMap& GetTranspose() { return Get(AlgorithmType::kTranspose); } AlgorithmsCacheMap& GetTranspose() { return Get(AlgorithmType::kTranspose); }
......
...@@ -25,7 +25,8 @@ enum ConvAlgos { GEMMKernel = 0, CuDNNKernel_1 = 1, CuDNNKernel_2 = 2 }; ...@@ -25,7 +25,8 @@ enum ConvAlgos { GEMMKernel = 0, CuDNNKernel_1 = 1, CuDNNKernel_2 = 2 };
TEST(AlgosCache, AlgosCache) { TEST(AlgosCache, AlgosCache) {
auto autotune_cache = phi::autotune::AutoTuneCache::Instance(); auto autotune_cache = phi::autotune::AutoTuneCache::Instance();
auto& cache = autotune_cache.GetConvForward(); auto& cache =
autotune_cache.GetConv(phi::autotune::AlgorithmType::kConvForward);
std::vector<int64_t> x_shape = {4, 224, 224, 3}; std::vector<int64_t> x_shape = {4, 224, 224, 3};
std::vector<int64_t> w_shape = {32, 3, 3, 3}; std::vector<int64_t> w_shape = {32, 3, 3, 3};
...@@ -37,7 +38,8 @@ TEST(AlgosCache, AlgosCache) { ...@@ -37,7 +38,8 @@ TEST(AlgosCache, AlgosCache) {
phi::autotune::ConvCacheKey key( phi::autotune::ConvCacheKey key(
x_shape, w_shape, paddings, strides, dilations, dtype, 0, 0); x_shape, w_shape, paddings, strides, dilations, dtype, 0, 0);
EXPECT_EQ(cache.Find(key), false); EXPECT_EQ(cache.Find(key), false);
phi::autotune::DnnNode node(static_cast<int64_t>(ConvAlgos::GEMMKernel), 0); phi::autotune::ConvAutoTuneResult node(
static_cast<int64_t>(ConvAlgos::GEMMKernel), 0, false);
cache.Set(key, node); cache.Set(key, node);
EXPECT_EQ(cache.Size(), 1); EXPECT_EQ(cache.Size(), 1);
EXPECT_EQ(cache.Find(key), true); EXPECT_EQ(cache.Find(key), true);
...@@ -48,8 +50,8 @@ TEST(AlgosCache, AlgosCache) { ...@@ -48,8 +50,8 @@ TEST(AlgosCache, AlgosCache) {
phi::autotune::ConvCacheKey key1( phi::autotune::ConvCacheKey key1(
x_shape, w_shape, paddings, strides, dilations, dtype, 0, 1); x_shape, w_shape, paddings, strides, dilations, dtype, 0, 1);
EXPECT_EQ(cache.Find(key1), false); EXPECT_EQ(cache.Find(key1), false);
phi::autotune::DnnNode node1(static_cast<int64_t>(ConvAlgos::CuDNNKernel_1), phi::autotune::ConvAutoTuneResult node1(
0); static_cast<int64_t>(ConvAlgos::CuDNNKernel_1), 0, false);
cache.Set(key1, node1); cache.Set(key1, node1);
EXPECT_EQ(cache.Size(), 2); EXPECT_EQ(cache.Size(), 2);
EXPECT_EQ(cache.CacheHits(), 1); EXPECT_EQ(cache.CacheHits(), 1);
......
...@@ -336,7 +336,7 @@ void ConvCudnnGradGradKernel( ...@@ -336,7 +336,7 @@ void ConvCudnnGradGradKernel(
#else #else
using search1 = using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result1 = search1::Find<T>(args1, exhaustive_search, false, ctx); fwd_result1 = search1::Find<T>(ctx, args1, exhaustive_search, false);
workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo);
#endif #endif
} }
...@@ -364,7 +364,7 @@ void ConvCudnnGradGradKernel( ...@@ -364,7 +364,7 @@ void ConvCudnnGradGradKernel(
#else #else
using search2 = using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result2 = search2::Find<T>(args2, exhaustive_search, false, ctx); fwd_result2 = search2::Find<T>(ctx, args2, exhaustive_search, false);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo)); workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo));
#endif #endif
...@@ -394,7 +394,7 @@ void ConvCudnnGradGradKernel( ...@@ -394,7 +394,7 @@ void ConvCudnnGradGradKernel(
using search3 = using search3 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_result = filter_result =
search3::Find<T>(args3, exhaustive_search, deterministic, ctx); search3::Find<T>(ctx, args3, exhaustive_search, deterministic);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo));
#endif #endif
...@@ -424,7 +424,7 @@ void ConvCudnnGradGradKernel( ...@@ -424,7 +424,7 @@ void ConvCudnnGradGradKernel(
using search4 = using search4 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
data_result = data_result =
search4::Find<T>(args4, exhaustive_search, deterministic, ctx); search4::Find<T>(ctx, args4, exhaustive_search, deterministic);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search4::GetWorkspaceSize(args4, data_result.algo)); workspace_size, search4::GetWorkspaceSize(args4, data_result.algo));
#endif #endif
......
...@@ -373,7 +373,7 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -373,7 +373,7 @@ void ConvCudnnGradKernel(const Context& ctx,
#else #else
using search1 = using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_result = search1::Find<T>(args1, exhaustive_search, deterministic, ctx); bwd_result = search1::Find<T>(ctx, args1, exhaustive_search, deterministic);
workspace_size_d = std::max(workspace_size_d, bwd_result.workspace_size); workspace_size_d = std::max(workspace_size_d, bwd_result.workspace_size);
#endif #endif
} }
...@@ -402,7 +402,7 @@ void ConvCudnnGradKernel(const Context& ctx, ...@@ -402,7 +402,7 @@ void ConvCudnnGradKernel(const Context& ctx,
using search2 = using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_result = filter_result =
search2::Find<T>(args2, exhaustive_search, deterministic, ctx); search2::Find<T>(ctx, args2, exhaustive_search, deterministic);
VLOG(3) << "filter algo: " << filter_result.algo << ", time " VLOG(3) << "filter algo: " << filter_result.algo << ", time "
<< filter_result.time; << filter_result.time;
workspace_size_w = std::max(workspace_size_w, filter_result.workspace_size); workspace_size_w = std::max(workspace_size_w, filter_result.workspace_size);
......
...@@ -25,7 +25,6 @@ ...@@ -25,7 +25,6 @@
#endif #endif
#include "paddle/fluid/platform/cudnn_workspace_helper.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h" #include "paddle/phi/common/float16.h"
...@@ -56,8 +55,7 @@ void ConvCudnnKernel(const Context& ctx, ...@@ -56,8 +55,7 @@ void ConvCudnnKernel(const Context& ctx,
bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t;
bool deterministic = FLAGS_cudnn_deterministic; bool deterministic = FLAGS_cudnn_deterministic;
auto exhaustive_deterministic = exhaustive_search && deterministic; PADDLE_ENFORCE_EQ(exhaustive_search && deterministic,
PADDLE_ENFORCE_EQ(exhaustive_deterministic,
false, false,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"Cann't set exhaustive_search True and " "Cann't set exhaustive_search True and "
...@@ -315,7 +313,7 @@ void ConvCudnnKernel(const Context& ctx, ...@@ -315,7 +313,7 @@ void ConvCudnnKernel(const Context& ctx,
paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result; paddle::operators::SearchResult<cudnnConvolutionFwdAlgo_t> fwd_result;
using search = using search =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result = search::Find<T>(args, exhaustive_search, deterministic, ctx); fwd_result = search::Find<T>(ctx, args, exhaustive_search, deterministic);
workspace_size = fwd_result.workspace_size; workspace_size = fwd_result.workspace_size;
#endif #endif
......
...@@ -230,7 +230,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -230,7 +230,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
#else #else
using search1 = using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result = search1::Find<T>(args1, false, deterministic, ctx); fwd_result = search1::Find<T>(ctx, args1, false, deterministic, false);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo)); workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo));
#endif #endif
...@@ -257,7 +257,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, ...@@ -257,7 +257,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
#else #else
using search2 = using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_result = search2::Find<T>(args2, false, deterministic, ctx); filter_result = search2::Find<T>(ctx, args2, false, deterministic, false);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo)); workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo));
#endif #endif
...@@ -710,7 +710,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -710,7 +710,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
#else #else
using search1 = using search1 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_result1 = search1::Find<T>(args1, false, deterministic, ctx); bwd_result1 = search1::Find<T>(ctx, args1, false, deterministic, false);
workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo); workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo);
#endif #endif
...@@ -734,7 +734,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -734,7 +734,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
#else #else
using search2 = using search2 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_result2 = search2::Find<T>(args2, false, deterministic, ctx); bwd_result2 = search2::Find<T>(ctx, args2, false, deterministic, false);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo)); workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo));
#endif #endif
...@@ -761,7 +761,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -761,7 +761,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
#else #else
using search3 = using search3 =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionBwdFilterAlgoPerf_t>;
filter_result = search3::Find<T>(args3, false, deterministic, ctx); filter_result = search3::Find<T>(ctx, args3, false, deterministic, false);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo));
#endif #endif
...@@ -789,7 +789,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( ...@@ -789,7 +789,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
#else #else
using search4 = using search4 =
paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t>;
fwd_result = search4::Find<T>(args4, false, deterministic, ctx); fwd_result = search4::Find<T>(ctx, args4, false, deterministic, false);
workspace_size = std::max( workspace_size = std::max(
workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo)); workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo));
#endif #endif
......
...@@ -230,7 +230,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, ...@@ -230,7 +230,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result; paddle::operators::SearchResult<cudnnConvolutionBwdDataAlgo_t> bwd_result;
using search = using search =
paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>; paddle::operators::SearchAlgorithm<cudnnConvolutionBwdDataAlgoPerf_t>;
bwd_result = search::Find<T>(args, false, deterministic, ctx); bwd_result = search::Find<T>(ctx, args, false, deterministic, false);
workspace_size = workspace_size =
std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo)); std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo));
#endif #endif
......
...@@ -72,15 +72,20 @@ def check_speed_result(case_name, develop_data, pr_data, pr_result): ...@@ -72,15 +72,20 @@ def check_speed_result(case_name, develop_data, pr_data, pr_result):
""" """
pr_gpu_time = pr_data.get("gpu_time") pr_gpu_time = pr_data.get("gpu_time")
develop_gpu_time = develop_data.get("gpu_time") develop_gpu_time = develop_data.get("gpu_time")
gpu_time_diff = (pr_gpu_time - develop_gpu_time) / develop_gpu_time if develop_gpu_time != 0.0:
gpu_time_diff = (pr_gpu_time - develop_gpu_time) / develop_gpu_time
gpu_time_diff_str = "{:.5f}".format(gpu_time_diff * 100)
else:
gpu_time_diff = None
gpu_time_diff_str = ""
pr_total_time = pr_data.get("total") pr_total_time = pr_data.get("total")
develop_total_time = develop_data.get("total") develop_total_time = develop_data.get("total")
total_time_diff = (pr_total_time - develop_total_time) / develop_total_time total_time_diff = (pr_total_time - develop_total_time) / develop_total_time
logging.info("------ OP: %s ------" % case_name) logging.info("------ OP: %s ------" % case_name)
logging.info("GPU time change: %.5f%% (develop: %.7f -> PR: %.7f)" % logging.info("GPU time change: %s (develop: %.7f -> PR: %.7f)" %
(gpu_time_diff * 100, develop_gpu_time, pr_gpu_time)) (gpu_time_diff_str, develop_gpu_time, pr_gpu_time))
logging.info("Total time change: %.5f%% (develop: %.7f -> PR: %.7f)" % logging.info("Total time change: %.5f%% (develop: %.7f -> PR: %.7f)" %
(total_time_diff * 100, develop_total_time, pr_total_time)) (total_time_diff * 100, develop_total_time, pr_total_time))
logging.info("backward: %s" % pr_result.get("backward")) logging.info("backward: %s" % pr_result.get("backward"))
...@@ -196,7 +201,8 @@ if __name__ == "__main__": ...@@ -196,7 +201,8 @@ if __name__ == "__main__":
args.develop_logs_dir) args.develop_logs_dir)
check_path_exists(args.pr_logs_dir) check_path_exists(args.pr_logs_dir)
for log_file in os.listdir(args.pr_logs_dir): pr_log_files = os.listdir(args.pr_logs_dir)
for log_file in sorted(pr_log_files):
develop_result = develop_result_dict.get(log_file) develop_result = develop_result_dict.get(log_file)
pr_result = parse_log_file(os.path.join(args.pr_logs_dir, log_file)) pr_result = parse_log_file(os.path.join(args.pr_logs_dir, log_file))
if develop_result is None or pr_result is None: if develop_result is None or pr_result is None:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册