Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
b4adbe5c
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
b4adbe5c
编写于
4月 19, 2022
作者:
Y
Yiqun Liu
提交者:
GitHub
4月 19, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Cherry-pick 2.3] Autotune the workspace and kernel choosing of conv (#41833)
Cherry-pick #40338 #41741 #41313
上级
a9d8b947
变更
28
隐藏空白更改
内联
并排
Showing
28 changed file
with
1178 addition
and
839 deletion
+1178
-839
paddle/fluid/eager/CMakeLists.txt
paddle/fluid/eager/CMakeLists.txt
+1
-1
paddle/fluid/framework/conv_search_cache.h
paddle/fluid/framework/conv_search_cache.h
+0
-1
paddle/fluid/imperative/CMakeLists.txt
paddle/fluid/imperative/CMakeLists.txt
+2
-2
paddle/fluid/operators/conv_base_helper.h
paddle/fluid/operators/conv_base_helper.h
+113
-0
paddle/fluid/operators/conv_cudnn_helper.h
paddle/fluid/operators/conv_cudnn_helper.h
+594
-443
paddle/fluid/operators/conv_cudnn_op_cache.h
paddle/fluid/operators/conv_cudnn_op_cache.h
+1
-1
paddle/fluid/operators/conv_miopen_helper.h
paddle/fluid/operators/conv_miopen_helper.h
+3
-69
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
+0
-2
paddle/fluid/platform/device/gpu/gpu_info.cc
paddle/fluid/platform/device/gpu/gpu_info.cc
+4
-0
paddle/fluid/platform/device_context.cc
paddle/fluid/platform/device_context.cc
+4
-3
paddle/fluid/platform/flags.cc
paddle/fluid/platform/flags.cc
+12
-4
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+2
-4
paddle/phi/backends/gpu/gpu_context.cc
paddle/phi/backends/gpu/gpu_context.cc
+37
-3
paddle/phi/backends/gpu/gpu_context.h
paddle/phi/backends/gpu/gpu_context.h
+8
-8
paddle/phi/kernels/CMakeLists.txt
paddle/phi/kernels/CMakeLists.txt
+9
-5
paddle/phi/kernels/autotune/CMakeLists.txt
paddle/phi/kernels/autotune/CMakeLists.txt
+5
-4
paddle/phi/kernels/autotune/cache.cc
paddle/phi/kernels/autotune/cache.cc
+37
-0
paddle/phi/kernels/autotune/cache.h
paddle/phi/kernels/autotune/cache.h
+56
-40
paddle/phi/kernels/autotune/cache_test.cc
paddle/phi/kernels/autotune/cache_test.cc
+1
-1
paddle/phi/kernels/autotune/switch_autotune.cc
paddle/phi/kernels/autotune/switch_autotune.cc
+74
-0
paddle/phi/kernels/autotune/switch_autotune.h
paddle/phi/kernels/autotune/switch_autotune.h
+32
-62
paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
+34
-37
paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
+22
-21
paddle/phi/kernels/gpudnn/conv_kernel.cu
paddle/phi/kernels/gpudnn/conv_kernel.cu
+8
-10
paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
+51
-56
paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
+8
-5
paddle/phi/kernels/impl/conv_cudnn_impl.h
paddle/phi/kernels/impl/conv_cudnn_impl.h
+1
-1
python/paddle/fluid/tests/unittests/test_switch_autotune.py
python/paddle/fluid/tests/unittests/test_switch_autotune.py
+59
-56
未找到文件。
paddle/fluid/eager/CMakeLists.txt
浏览文件 @
b4adbe5c
...
@@ -15,7 +15,7 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
...
@@ -15,7 +15,7 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
add_subdirectory
(
pylayer
)
add_subdirectory
(
pylayer
)
cc_library
(
grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator
)
cc_library
(
grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator
)
add_dependencies
(
grad_tensor_holder eager_final_state_codegen
)
add_dependencies
(
grad_tensor_holder eager_final_state_codegen
)
cc_library
(
backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info
)
cc_library
(
backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info
switch_autotune
)
endif
()
endif
()
cc_library
(
grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor
)
cc_library
(
grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor
)
...
...
paddle/fluid/framework/conv_search_cache.h
浏览文件 @
b4adbe5c
...
@@ -16,7 +16,6 @@ limitations under the License. */
...
@@ -16,7 +16,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
namespace
paddle
{
namespace
paddle
{
...
...
paddle/fluid/imperative/CMakeLists.txt
浏览文件 @
b4adbe5c
...
@@ -9,8 +9,8 @@ cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_f
...
@@ -9,8 +9,8 @@ cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_f
add_subdirectory
(
jit
)
add_subdirectory
(
jit
)
cc_library
(
amp SRCS amp_auto_cast.cc DEPS layer var_helper
)
cc_library
(
amp SRCS amp_auto_cast.cc DEPS layer var_helper
)
cc_library
(
tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector var_helper
)
cc_library
(
tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector var_helper
)
cc_library
(
basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator
)
cc_library
(
basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator
switch_autotune
)
cc_library
(
engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator
)
cc_library
(
engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator
switch_autotune
)
cc_library
(
imperative_profiler SRCS profiler.cc DEPS flags
)
cc_library
(
imperative_profiler SRCS profiler.cc DEPS flags
)
if
(
NOT WIN32
)
if
(
NOT WIN32
)
if
(
WITH_NCCL OR WITH_RCCL
)
if
(
WITH_NCCL OR WITH_RCCL
)
...
...
paddle/fluid/operators/conv_base_helper.h
0 → 100644
浏览文件 @
b4adbe5c
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <array>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/conv_search_cache.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/autotune/cache.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
DataLayout
=
platform
::
DataLayout
;
using
framework
::
AlgorithmsCache
;
using
framework
::
ConvSearchCache
;
template
<
typename
T
>
using
ScalingParamType
=
typename
platform
::
CudnnDataType
<
T
>::
ScalingParamType
;
// As the basic for SearchAlgorithm struct.
template
<
typename
PerfT
>
struct
SearchAlgorithm
{};
// As the container of searchAlgorithm::Find() result.
template
<
typename
AlgoT
>
struct
SearchResult
{
SearchResult
()
{}
explicit
SearchResult
(
AlgoT
a
)
:
algo
(
a
)
{}
AlgoT
algo
=
static_cast
<
AlgoT
>
(
0
);
float
time
=
-
1.
f
;
size_t
workspace_size
=
0
;
};
template
<
typename
T
>
static
std
::
ostream
&
operator
<<
(
std
::
ostream
&
out
,
const
std
::
vector
<
T
>&
v
)
{
out
<<
"["
;
for
(
auto
const
&
tmp
:
v
)
out
<<
tmp
<<
","
;
out
<<
"]"
;
return
out
;
}
// As the container of conv relevant descriptors.
template
<
typename
HandleT
,
typename
DataT
>
struct
ConvArgsBase
{
HandleT
handle
;
platform
::
TensorDescriptor
idesc
,
odesc
;
platform
::
FilterDescriptor
wdesc
;
platform
::
ConvolutionDescriptor
cdesc
;
const
framework
::
Tensor
*
x
,
*
w
,
*
o
;
DataT
cudnn_dtype
;
// strides
std
::
vector
<
int
>
s
;
// paddings
std
::
vector
<
int
>
p
;
// dilations
std
::
vector
<
int
>
d
;
ConvArgsBase
(
const
framework
::
Tensor
*
x
,
const
framework
::
Tensor
*
w
,
const
framework
::
Tensor
*
o
,
const
std
::
vector
<
int
>
s
,
const
std
::
vector
<
int
>
p
,
const
std
::
vector
<
int
>
d
,
DataT
dtype
)
:
x
(
x
),
w
(
w
),
o
(
o
),
s
(
s
),
p
(
p
),
d
(
d
),
cudnn_dtype
(
dtype
)
{}
template
<
typename
T
>
size_t
GetCacheKey
()
const
{
auto
x_shape
=
phi
::
vectorize
(
x
->
dims
());
auto
w_shape
=
phi
::
vectorize
(
w
->
dims
());
VLOG
(
10
)
<<
"[ConvArgs] x_dims="
<<
x_shape
<<
", w_dims="
<<
w_shape
<<
", strides="
<<
s
<<
", paddings="
<<
p
<<
", dilations="
<<
d
;
return
phi
::
autotune
::
ConvKey
(
x_shape
,
w_shape
,
p
,
s
,
d
,
paddle
::
experimental
::
CppTypeToDataType
<
T
>::
Type
());
}
};
static
inline
void
GetNCDHW
(
const
framework
::
DDim
&
dims
,
const
DataLayout
&
layout
,
int
*
N
,
int
*
C
,
int
*
D
,
int
*
H
,
int
*
W
)
{
*
N
=
dims
[
0
];
*
C
=
layout
==
DataLayout
::
kNCHW
?
dims
[
1
]
:
dims
[
dims
.
size
()
-
1
];
int
i
=
layout
==
DataLayout
::
kNCHW
?
0
:
1
;
if
(
dims
.
size
()
==
5
)
{
*
D
=
dims
[
2
-
i
];
*
H
=
dims
[
3
-
i
];
*
W
=
dims
[
4
-
i
];
}
else
{
*
D
=
1
;
*
H
=
dims
[
2
-
i
];
*
W
=
dims
[
3
-
i
];
}
}
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/conv_cudnn_helper.h
浏览文件 @
b4adbe5c
...
@@ -14,44 +14,17 @@ limitations under the License. */
...
@@ -14,44 +14,17 @@ limitations under the License. */
#pragma once
#pragma once
#include <algorithm>
#include "paddle/fluid/operators/conv_base_helper.h"
#include <array>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/conv_search_cache.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
ConvArgs
=
ConvArgsBase
<
cudnnHandle_t
,
cudnnDataType_t
>
;
using
DataLayout
=
platform
::
DataLayout
;
template
<
typename
T
>
using
ScalingParamType
=
typename
platform
::
CudnnDataType
<
T
>::
ScalingParamType
;
using
framework
::
AlgorithmsCache
;
static
inline
void
GetNCDHW
(
const
framework
::
DDim
&
dims
,
const
DataLayout
&
layout
,
int
*
N
,
int
*
C
,
int
*
D
,
int
*
H
,
int
*
W
)
{
*
N
=
dims
[
0
];
*
C
=
layout
==
DataLayout
::
kNCHW
?
dims
[
1
]
:
dims
[
dims
.
size
()
-
1
];
int
i
=
layout
==
DataLayout
::
kNCHW
?
0
:
1
;
if
(
dims
.
size
()
==
5
)
{
*
D
=
dims
[
2
-
i
];
*
H
=
dims
[
3
-
i
];
*
W
=
dims
[
4
-
i
];
}
else
{
*
D
=
1
;
*
H
=
dims
[
2
-
i
];
*
W
=
dims
[
3
-
i
];
}
}
template
<
typename
DeviceContext
,
typename
T
,
size_t
D
>
template
<
typename
DeviceContext
,
typename
T
,
size_t
D
>
static
void
RemovePaddingSlice
(
const
phi
::
GPUContext
&
context
,
static
void
RemovePaddingSlice
(
const
phi
::
GPUContext
&
context
,
...
@@ -68,121 +41,117 @@ static void RemovePaddingSlice(const phi::GPUContext& context,
...
@@ -68,121 +41,117 @@ static void RemovePaddingSlice(const phi::GPUContext& context,
extents
[
i
]
=
new_out_dims
[
i
];
extents
[
i
]
=
new_out_dims
[
i
];
}
}
int
start
;
for
(
size_t
i
=
0
;
i
<
axes
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
axes
.
size
();
++
i
)
{
start
=
starts
[
i
];
int
start
=
starts
[
i
];
if
(
start
<
0
)
{
if
(
start
<
0
)
{
start
=
(
start
+
in_dims
[
axes
[
i
]]);
start
=
(
start
+
in_dims
[
axes
[
i
]]);
}
}
start
=
std
::
max
(
start
,
0
);
start
=
std
::
max
(
start
,
0
);
offsets
[
axes
[
i
]]
=
start
;
offsets
[
axes
[
i
]]
=
start
;
}
}
auto
in_t
=
auto
in_t
=
framework
::
EigenTensor
<
T
,
D
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>::
From
(
framework
::
EigenTensor
<
T
,
D
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>::
From
(
*
input
);
*
input
);
auto
out_t
=
auto
out_t
=
framework
::
EigenTensor
<
T
,
D
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>::
From
(
framework
::
EigenTensor
<
T
,
D
,
Eigen
::
RowMajor
,
Eigen
::
DenseIndex
>::
From
(
*
out
,
new_out_dims
);
*
out
,
new_out_dims
);
EigenSlice
<
std
::
decay_t
<
decltype
(
place
)
>
,
T
,
D
>::
Eval
(
place
,
out_t
,
in_t
,
offsets
,
extents
);
phi
::
funcs
::
EigenSlice
<
std
::
decay_t
<
decltype
(
place
)
>
,
T
,
D
>::
Eval
(
place
,
out_t
,
in_t
,
offsets
,
extents
);
}
}
template
<
typename
T
>
static
inline
double
ToMegaBytes
(
size_t
bytes
)
{
std
::
ostream
&
operator
<<
(
std
::
ostream
&
out
,
const
std
::
vector
<
T
>&
v
)
{
return
static_cast
<
double
>
(
bytes
)
/
(
1
<<
20
);
out
<<
"["
;
for
(
auto
const
&
tmp
:
v
)
out
<<
tmp
<<
","
;
out
<<
"]"
;
return
out
;
}
}
inline
int
MaxBwdFilterAlgos
(
cudnnHandle_t
cudnn_handle
)
{
static
inline
bool
UseFixedWorkspace
()
{
int
max_algos
=
0
;
return
FLAGS_conv_workspace_size_limit
>=
0
;
#if CUDNN_VERSION_MIN(7, 0, 1)
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithmMaxCount
(
cudnn_handle
,
&
max_algos
));
#endif
return
max_algos
;
}
}
template
<
typename
PerfType
,
typename
AlgoType
>
static
size_t
CalcWorkspaceLimitInBytes
(
bool
use_fixed_workspace
)
{
void
ChooseAlgoByWorkspace
(
PerfType
*
perf_results
,
size_t
perf_num
,
if
(
!
use_fixed_workspace
)
{
size_t
workspace_byte
,
AlgoType
*
algo
)
{
int
device_id
=
platform
::
GetCurrentDeviceId
();
for
(
size_t
i
=
0
;
i
<
perf_num
;
++
i
)
{
int64_t
allocated
=
memory
::
StatGetCurrentValue
(
"Allocated"
,
device_id
);
auto
result
=
perf_results
[
i
];
int64_t
reserved
=
memory
::
StatGetCurrentValue
(
"Reserved"
,
device_id
);
if
(
result
.
status
==
CUDNN_STATUS_SUCCESS
&&
int64_t
availble
=
platform
::
GpuAvailableMemToAlloc
();
result
.
memory
<
workspace_byte
)
{
VLOG
(
3
)
<<
"[memory] allocated="
<<
ToMegaBytes
(
allocated
)
*
algo
=
result
.
algo
;
<<
" MB, reserved="
<<
ToMegaBytes
(
reserved
)
VLOG
(
3
)
<<
" algo: "
<<
result
.
algo
<<
", time: "
<<
result
.
time
<<
" MB, available_to_alloc="
<<
ToMegaBytes
(
availble
)
<<
" MB."
;
<<
" ms, wksp = "
<<
result
.
memory
return
std
::
max
(
availble
,
reserved
-
allocated
);
<<
", status = "
<<
result
.
status
;
}
else
{
return
;
return
FLAGS_conv_workspace_size_limit
*
1024
*
1024
;
}
}
}
VLOG
(
3
)
<<
"Can not find alog that requires memory < "
<<
static_cast
<
double
>
(
workspace_byte
)
/
(
1
<<
20
)
<<
" MB"
;
}
}
template
<
typename
PerfType
,
typename
AlgoType
>
template
<
typename
PerfT
>
void
ChooseAlgo
(
const
std
::
vector
<
PerfType
>&
perf_results
,
std
::
string
GetPerfResultString
(
std
::
string
prefix
,
size_t
workspace_byte
,
AlgoType
*
algo
)
{
const
std
::
vector
<
PerfT
>&
perf_results
,
VLOG
(
3
)
<<
"=========BwdFilterAlgo Perf result========="
;
int
actual_algo_count
,
size_t
workspace_limit
)
{
for
(
const
auto
&
result
:
perf_results
)
{
std
::
ostringstream
out
;
auto
math_type_str
=
"False"
;
out
<<
prefix
<<
" (workspace limit="
<<
ToMegaBytes
(
workspace_limit
)
if
(
result
.
mathType
==
CUDNN_TENSOR_OP_MATH
)
{
<<
" MB):
\n
"
;
math_type_str
=
"True"
;
for
(
int
i
=
0
;
i
<
actual_algo_count
;
++
i
)
{
}
const
auto
&
result
=
perf_results
[
i
];
VLOG
(
3
)
<<
" algo: "
<<
result
.
algo
<<
", TensorCore: "
<<
math_type_str
auto
math_type_str
=
(
result
.
mathType
==
CUDNN_TENSOR_OP_MATH
)
?
"T"
:
"F"
;
<<
", time: "
<<
result
.
time
<<
" ms"
out
<<
" algo="
<<
result
.
algo
<<
": tensor_core="
<<
math_type_str
<<
", wksp = "
<<
result
.
memory
<<
", status = "
<<
result
.
status
;
<<
", time="
<<
result
.
time
<<
" ms, memory="
<<
ToMegaBytes
(
result
.
memory
)
<<
" MB, status="
<<
result
.
status
<<
"
\n
"
;
}
}
return
out
.
str
();
}
for
(
size_t
i
=
0
;
i
!=
perf_results
.
size
();
++
i
)
{
// Choose an algorithm which has the minimize time cost and less memory.
const
auto
&
result
=
perf_results
[
i
];
// NOTE: perf_results is ordered by time.
template
<
typename
PerfT
,
typename
AlgoT
>
void
ChooseAlgoByWorkspace
(
const
std
::
vector
<
PerfT
>&
perf_results
,
size_t
workspace_limit
,
SearchResult
<
AlgoT
>*
search_result
)
{
int
best_algo_idx
=
-
1
;
for
(
size_t
i
=
0
;
i
<
perf_results
.
size
();
++
i
)
{
auto
result
=
perf_results
[
i
];
if
(
result
.
status
==
CUDNN_STATUS_SUCCESS
&&
if
(
result
.
status
==
CUDNN_STATUS_SUCCESS
&&
(
result
.
memory
<=
workspace_byte
))
{
result
.
memory
<
workspace_limit
)
{
if
((
result
.
mathType
==
CUDNN_TENSOR_OP_MATH
)
&&
if
(
best_algo_idx
==
-
1
)
{
(
i
!=
perf_results
.
size
()
-
1
))
{
// The algorithm which has minimize time cost and need a workspace_size
const
auto
&
next_result
=
perf_results
[
i
+
1
];
// fitting the workspace_limit constraint.
if
(
next_result
.
status
==
CUDNN_STATUS_SUCCESS
&&
best_algo_idx
=
i
;
next_result
.
algo
==
result
.
algo
&&
// Each perf_results[i].time is set to be -1 in heuristic search.
next_result
.
memory
==
result
.
memory
&&
if
(
perf_results
[
best_algo_idx
].
time
<
0
)
{
next_result
.
mathType
!=
CUDNN_TENSOR_OP_MATH
&&
break
;
next_result
.
time
<
1.01
*
result
.
time
)
{
}
// Skip over this result- it's not really a Tensor Core algo.
}
else
{
// Because it is only 1% performance difference.
float
best_algo_time
=
perf_results
[
best_algo_idx
].
time
;
// Prefer to choose the next equivalent non-Tensor Core algo.
if
((
result
.
time
-
best_algo_time
)
/
best_algo_time
<
0.01
)
{
continue
;
best_algo_idx
=
(
result
.
memory
<
perf_results
[
best_algo_idx
].
memory
)
?
i
:
best_algo_idx
;
break
;
}
}
}
}
*
algo
=
result
.
algo
;
auto
math_type_str
=
"0"
;
if
(
result
.
mathType
==
CUDNN_TENSOR_OP_MATH
)
{
math_type_str
=
"1"
;
}
VLOG
(
3
)
<<
" choose algo: "
<<
result
.
algo
<<
", TC: "
<<
math_type_str
<<
", time: "
<<
result
.
time
<<
" ms"
<<
", wksp = "
<<
result
.
memory
<<
", status = "
<<
result
.
status
;
break
;
}
}
}
}
if
(
best_algo_idx
!=
-
1
)
{
search_result
->
algo
=
perf_results
[
best_algo_idx
].
algo
;
search_result
->
time
=
perf_results
[
best_algo_idx
].
time
;
search_result
->
workspace_size
=
perf_results
[
best_algo_idx
].
memory
;
}
else
{
VLOG
(
3
)
<<
"Can not find an algorithm that requires memory < "
<<
ToMegaBytes
(
workspace_limit
)
<<
" MB"
;
}
}
}
using
framework
::
ConvSearchCache
;
static
void
SetConvMathType
(
const
phi
::
GPUContext
&
ctx
,
cudnnDataType_t
dtype
,
static
void
SetConvMathType
(
const
phi
::
GPUContext
&
ctx
,
cudnnDataType_t
dtype
,
const
platform
::
ConvolutionDescriptor
&
cdesc
)
{
const
platform
::
ConvolutionDescriptor
&
cdesc
)
{
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
auto
&
dev_ctx
=
ctx
;
if
(
ctx
.
GetComputeCapability
()
>=
70
&&
dtype
==
CUDNN_DATA_HALF
)
{
if
(
dev_ctx
.
GetComputeCapability
()
>=
70
&&
dtype
==
CUDNN_DATA_HALF
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cdesc
.
desc
(),
CUDNN_TENSOR_OP_MATH
));
cdesc
.
desc
(),
CUDNN_TENSOR_OP_MATH
));
VLOG
(
5
)
<<
"use cudnn_tensor_op_math"
;
VLOG
(
5
)
<<
"use cudnn_tensor_op_math"
;
#if CUDA_VERSION >= 11000
#if CUDA_VERSION >= 11000
#if CUDNN_VERSION_MIN(8, 1, 0)
#if CUDNN_VERSION_MIN(8, 1, 0)
}
else
if
(
dev_ctx
.
GetComputeCapability
()
>=
80
&&
}
else
if
(
ctx
.
GetComputeCapability
()
>=
80
&&
dtype
==
CUDNN_DATA_BFLOAT16
)
{
dtype
==
CUDNN_DATA_BFLOAT16
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnSetConvolutionMathType
(
cdesc
.
desc
(),
CUDNN_TENSOR_OP_MATH
));
cdesc
.
desc
(),
CUDNN_TENSOR_OP_MATH
));
#endif // CUDNN_VERSION_MIN(8, 1, 0)
#endif // CUDNN_VERSION_MIN(8, 1, 0)
...
@@ -198,411 +167,593 @@ static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype,
...
@@ -198,411 +167,593 @@ static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype,
#endif
#endif
}
}
struct
ConvArgs
{
// cuDNN convolution forward algorithm searcher, consisted of three searching
cudnnHandle_t
handle
;
// modes, namely: deterministic, heuristic and exhaustive_search mode.
platform
::
TensorDescriptor
idesc
,
odesc
;
// As well as one workspace size acquirsition function with respect to
platform
::
FilterDescriptor
wdesc
;
// the chosen alogrithm.
platform
::
ConvolutionDescriptor
cdesc
;
const
framework
::
Tensor
*
x
,
*
w
,
*
o
;
cudnnDataType_t
cudnn_dtype
;
// strides
std
::
vector
<
int
>
s
;
// paddings
std
::
vector
<
int
>
p
;
// dilations
std
::
vector
<
int
>
d
;
ConvArgs
(
const
framework
::
Tensor
*
x
,
const
framework
::
Tensor
*
w
,
const
framework
::
Tensor
*
o
,
const
std
::
vector
<
int
>
s
,
const
std
::
vector
<
int
>
p
,
const
std
::
vector
<
int
>
d
,
cudnnDataType_t
dtype
)
:
x
(
x
),
w
(
w
),
o
(
o
),
s
(
s
),
p
(
p
),
d
(
d
),
cudnn_dtype
(
dtype
)
{}
};
template
<
typename
perf_t
>
struct
SearchAlgorithm
{};
template
<
>
template
<
>
struct
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
{
struct
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
{
using
perf_t
=
cudnnConvolutionFwdAlgoPerf_t
;
using
PerfT
=
cudnnConvolutionFwdAlgoPerf_t
;
using
algo_t
=
cudnnConvolutionFwdAlgo_t
;
using
AlgoT
=
cudnnConvolutionFwdAlgo_t
;
template
<
typename
T
>
template
<
typename
T
>
static
algo_t
Find
(
const
ConvArgs
&
args
,
bool
exhaustive_search
,
static
SearchResult
<
AlgoT
>
Find
(
const
ConvArgs
&
args
,
bool
exhaustive_search
,
bool
deterministic
,
const
phi
::
GPUContext
&
ctx
)
{
bool
deterministic
,
const
phi
::
GPUContext
&
ctx
)
{
SearchResult
<
AlgoT
>
result
;
auto
dtype
=
platform
::
CudnnDataType
<
T
>::
type
;
auto
dtype
=
platform
::
CudnnDataType
<
T
>::
type
;
bool
has_got_workspace_size
=
true
;
size_t
workspace_size_limit
=
FLAGS_conv_workspace_size_limit
*
1024
*
1024
;
size_t
workspace_size
=
0
;
algo_t
algo
;
SetConvMathType
(
ctx
,
dtype
,
args
.
cdesc
);
SetConvMathType
(
ctx
,
dtype
,
args
.
cdesc
);
if
(
!
exhaustive_search
&&
!
deterministic
)
{
if
(
deterministic
)
{
result
=
FindAlgoDeterministic
();
}
else
{
// 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
// 2. Once turning on auto-tune, runn heuristic search(default) before
// auto-tune process, run exhaustive_search during mentioned process.
// 3. After auto-tune process, run cached algorithm if cached, run
// default mode for the rest.
size_t
key
=
args
.
GetCacheKey
<
T
>
();
auto
&
cache
=
phi
::
autotune
::
AutoTuneCache
::
Instance
().
GetConvForward
();
if
(
cache
.
Find
(
key
))
{
result
.
algo
=
static_cast
<
AlgoT
>
(
cache
.
Get
(
key
));
}
else
{
bool
use_autotune
=
phi
::
autotune
::
AutoTuneStatus
::
Instance
().
UseAutoTune
();
if
(
exhaustive_search
||
use_autotune
)
{
result
=
FindAlgoExhaustiveSearch
<
T
>
(
args
,
ctx
);
cache
.
Set
(
key
,
static_cast
<
int64_t
>
(
result
.
algo
));
}
else
{
result
=
FindAlgoHeuristic
(
args
,
ctx
);
}
}
}
VLOG
(
3
)
<<
"[cuDNN Convoltion] exhaustive_search="
<<
exhaustive_search
<<
", deterministic="
<<
deterministic
<<
", choose algo="
<<
result
.
algo
<<
", workspace="
<<
ToMegaBytes
(
GetWorkspaceSize
(
args
,
result
.
algo
))
<<
" MB"
;
return
result
;
}
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
,
cudnnConvolutionFwdAlgo_t
algo
)
{
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
algo
,
&
workspace_size
));
return
workspace_size
;
}
private:
static
SearchResult
<
AlgoT
>
FindAlgoDeterministic
()
{
return
SearchResult
<
AlgoT
>
(
static_cast
<
AlgoT
>
(
1
));
}
// Heuristic search mode, calling the cudnnGetXxxAlgorithm.
static
SearchResult
<
AlgoT
>
FindAlgoHeuristic
(
const
ConvArgs
&
args
,
const
phi
::
GPUContext
&
ctx
)
{
SearchResult
<
AlgoT
>
result
;
size_t
workspace_size_limit
=
CalcWorkspaceLimitInBytes
(
UseFixedWorkspace
());
#if CUDNN_VERSION >= 7001
#if CUDNN_VERSION >= 7001
int
perf_count
;
int
actual_
perf_count
;
int
best_algo_idx
=
0
;
int
best_algo_idx
=
0
;
std
::
unique_ptr
<
perf_t
[]
>
perf_results
(
new
perf_t
[
kNUM_CUDNN_FWD_ALGS
]
);
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_FWD_ALGS
);
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm_v7
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm_v7
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
kNUM_CUDNN_FWD_ALGS
,
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
kNUM_CUDNN_FWD_ALGS
,
&
perf_count
,
perf_results
.
get
()));
&
actual_perf_count
,
perf_results
.
data
()));
algo
=
(
perf_results
.
get
())
[
best_algo_idx
].
algo
;
result
.
algo
=
perf_results
[
best_algo_idx
].
algo
;
workspace_size
=
(
perf_results
.
get
())
[
best_algo_idx
].
memory
;
result
.
workspace_size
=
perf_results
[
best_algo_idx
].
memory
;
if
(
workspace_size
>
workspace_size_limit
)
{
if
(
result
.
workspace_size
>
workspace_size_limit
)
{
#if CUDNN_VERSION >= 8000
#if CUDNN_VERSION >= 8000
// cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8
// cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8
ChooseAlgoByWorkspace
<
perf_t
,
algo_t
>
(
perf_results
.
get
(),
ChooseAlgoByWorkspace
<
PerfT
,
AlgoT
>
(
perf_results
,
workspace_size_limit
,
kNUM_CUDNN_FWD_ALGS
,
&
result
);
workspace_size_limit
,
&
algo
);
#else
VLOG
(
1
)
<<
"Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<<
workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
#endif
}
#else
#else
VLOG
(
3
)
<<
"Fallback to non-v7 method to find conv algorithm "
"becasue the workspace size request("
<<
result
.
workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
platform
::
dynload
::
cudnnGetConvolutionForwardAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
workspace_size_limit
,
&
(
result
.
algo
)
));
#endif
#endif
VLOG
(
3
)
<<
"choose algo "
<<
algo
;
}
else
if
(
deterministic
)
{
algo
=
static_cast
<
cudnnConvolutionFwdAlgo_t
>
(
1
);
}
else
{
auto
&
dev_ctx
=
ctx
;
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
AlgorithmsCache
<
algo_t
>&
algo_cache
=
*
(
framework
::
ConvSearchCache
::
Instance
().
GetForward
());
auto
x_dims
=
phi
::
vectorize
(
args
.
x
->
dims
());
auto
w_dims
=
phi
::
vectorize
(
args
.
w
->
dims
());
VLOG
(
10
)
<<
"cudnnConvolutionFwdAlgoPerf_t:"
<<
", x_dims:"
<<
x_dims
<<
", w_dims:"
<<
w_dims
<<
", args.s"
<<
args
.
s
<<
", args.p"
<<
args
.
p
<<
", args.d"
<<
args
.
d
;
algo
=
algo_cache
.
GetAlgorithm
(
x_dims
,
w_dims
,
args
.
s
,
args
.
p
,
args
.
d
,
0
,
static_cast
<
int64_t
>
(
args
.
cudnn_dtype
),
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
perf_t
,
kNUM_CUDNN_FWD_ALGS
>
perf_stat
;
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace_ptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnFindConvolutionForwardAlgorithmEx
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
x
->
data
<
T
>
(),
args
.
wdesc
.
desc
(),
args
.
w
->
data
<
T
>
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
const_cast
<
T
*>
(
args
.
o
->
data
<
T
>
()),
kNUM_CUDNN_FWD_ALGS
,
&
returned_algo_count
,
perf_stat
.
data
(),
cudnn_workspace_ptr
,
workspace_size_limit
));
};
workspace_handle
.
RunFuncSync
(
cudnn_find_func
,
workspace_size_limit
);
VLOG
(
3
)
<<
"FwdAlgo Perf result: (algo: stat, time, memory)"
;
for
(
int
i
=
0
;
i
<
returned_algo_count
;
++
i
)
{
const
auto
&
stat
=
perf_stat
[
i
];
VLOG
(
3
)
<<
stat
.
algo
<<
": "
<<
stat
.
status
<<
" "
<<
stat
.
time
<<
" "
<<
stat
.
memory
;
}
return
perf_stat
[
0
].
algo
;
});
}
}
VLOG
(
3
)
<<
"choose algo "
<<
algo
;
#else
return
algo
;
}
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
,
algo_t
algo
)
{
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionForward
WorkspaceSize
(
platform
::
dynload
::
cudnnGetConvolutionForward
Algorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
algo
,
&
workspace_size
));
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
return
workspace_size
;
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
(
result
.
algo
)));
#endif
return
result
;
}
template
<
typename
T
>
static
SearchResult
<
AlgoT
>
FindAlgoExhaustiveSearch
(
const
ConvArgs
&
args
,
const
phi
::
GPUContext
&
ctx
)
{
SearchResult
<
AlgoT
>
result
;
size_t
workspace_size_limit
=
CalcWorkspaceLimitInBytes
(
UseFixedWorkspace
());
size_t
max_workspace_size
=
GetMaxWorkspaceSize
(
args
,
workspace_size_limit
);
VLOG
(
4
)
<<
"max_workspace_size="
<<
ToMegaBytes
(
max_workspace_size
)
<<
" MB"
;
int
returned_algo_count
;
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_FWD_ALGS
);
auto
cudnn_find_func
=
[
&
](
void
*
workspace_ptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnFindConvolutionForwardAlgorithmEx
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
x
->
data
<
T
>
(),
args
.
wdesc
.
desc
(),
args
.
w
->
data
<
T
>
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
const_cast
<
T
*>
(
args
.
o
->
data
<
T
>
()),
kNUM_CUDNN_FWD_ALGS
,
&
returned_algo_count
,
perf_results
.
data
(),
workspace_ptr
,
max_workspace_size
));
};
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
workspace_handle
.
RunFuncSync
(
cudnn_find_func
,
max_workspace_size
,
UseFixedWorkspace
());
VLOG
(
4
)
<<
GetPerfResultString
<
PerfT
>
(
"[Exhaustive Search] FwdAlgo Perf result"
,
perf_results
,
returned_algo_count
,
workspace_size_limit
);
ChooseAlgoByWorkspace
<
PerfT
,
AlgoT
>
(
perf_results
,
workspace_size_limit
,
&
result
);
return
result
;
}
static
size_t
GetMaxWorkspaceSize
(
const
ConvArgs
&
args
,
size_t
workspace_size_limit
)
{
if
(
!
UseFixedWorkspace
())
{
size_t
max_workspace_size
=
0
;
for
(
size_t
algo
=
0
;
algo
<
kNUM_CUDNN_FWD_ALGS
;
++
algo
)
{
size_t
workspace_size
=
0
;
auto
status
=
platform
::
dynload
::
cudnnGetConvolutionForwardWorkspaceSize
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
odesc
.
desc
(),
static_cast
<
cudnnConvolutionFwdAlgo_t
>
(
algo
),
&
workspace_size
);
if
(
status
==
CUDNN_STATUS_SUCCESS
&&
workspace_size
<=
workspace_size_limit
)
{
max_workspace_size
=
std
::
max
(
workspace_size
,
max_workspace_size
);
}
}
return
max_workspace_size
;
}
else
{
return
workspace_size_limit
;
}
}
}
};
};
// cuDNN convolution backward data-algorithm searcher, consisting of three
// searching modes, namely: deterministic, heuristic, and exhaustive_search
// mode. Specially, there are 2 pattens of exhaustive search mode, one for
// HALF precision only, one for the rest.
// As well as one workspace size acquirsition function with
// respect to the chosen alogrithm.
template
<
>
template
<
>
struct
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
{
struct
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
{
using
perf_t
=
cudnnConvolutionBwdDataAlgoPerf_t
;
using
PerfT
=
cudnnConvolutionBwdDataAlgoPerf_t
;
using
algo_t
=
cudnnConvolutionBwdDataAlgo_t
;
using
AlgoT
=
cudnnConvolutionBwdDataAlgo_t
;
template
<
typename
T
>
template
<
typename
T
>
static
algo_t
Find
(
const
ConvArgs
&
args
,
bool
exhaustive_search
,
static
SearchResult
<
AlgoT
>
Find
(
const
ConvArgs
&
args
,
bool
exhaustive_search
,
bool
deterministic
,
const
phi
::
GPUContext
&
ctx
)
{
bool
deterministic
,
const
phi
::
GPUContext
&
ctx
)
{
SearchResult
<
AlgoT
>
result
;
auto
dtype
=
platform
::
CudnnDataType
<
T
>::
type
;
auto
dtype
=
platform
::
CudnnDataType
<
T
>::
type
;
size_t
workspace_size_limit
=
FLAGS_conv_workspace_size_limit
*
1024
*
1024
;
size_t
workspace_size
=
0
;
bool
has_got_workspace_size
=
true
;
algo_t
algo
;
SetConvMathType
(
ctx
,
dtype
,
args
.
cdesc
);
SetConvMathType
(
ctx
,
dtype
,
args
.
cdesc
);
if
(
!
exhaustive_search
&&
!
deterministic
)
{
if
(
deterministic
)
{
result
=
FindAlgoDeterministic
();
}
else
{
// 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
// 2. Once turning on auto-tune, runn heuristic search(default) before
// auto-tune process, run exhaustive_search during mentioned process.
// 3. After auto-tune process, run cached algorithm if cached, run
// default mode for the rest.
size_t
key
=
args
.
GetCacheKey
<
T
>
();
auto
&
cache
=
phi
::
autotune
::
AutoTuneCache
::
Instance
().
GetConvBackwardData
();
if
(
cache
.
Find
(
key
))
{
result
.
algo
=
static_cast
<
AlgoT
>
(
cache
.
Get
(
key
));
}
else
{
bool
use_autotune
=
phi
::
autotune
::
AutoTuneStatus
::
Instance
().
UseAutoTune
();
if
(
exhaustive_search
||
use_autotune
)
{
result
=
FindAlgoExhaustiveSearch
<
T
>
(
args
,
ctx
);
cache
.
Set
(
key
,
static_cast
<
int64_t
>
(
result
.
algo
));
}
else
{
result
=
FindAlgoHeuristic
(
args
,
ctx
);
}
}
}
VLOG
(
3
)
<<
"[cuDNN Convoltion] exhaustive_search="
<<
exhaustive_search
<<
", deterministic="
<<
deterministic
<<
", choose algo="
<<
result
.
algo
<<
", workspace="
<<
ToMegaBytes
(
GetWorkspaceSize
(
args
,
result
.
algo
))
<<
" MB"
;
return
result
;
}
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
,
cudnnConvolutionBwdDataAlgo_t
algo
)
{
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
idesc
.
desc
(),
algo
,
&
workspace_size
));
return
workspace_size
;
}
private:
static
SearchResult
<
AlgoT
>
FindAlgoDeterministic
()
{
return
SearchResult
<
AlgoT
>
(
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
);
}
static
SearchResult
<
AlgoT
>
FindAlgoHeuristic
(
const
ConvArgs
&
args
,
const
phi
::
GPUContext
&
ctx
)
{
SearchResult
<
AlgoT
>
result
;
size_t
workspace_size_limit
=
CalcWorkspaceLimitInBytes
(
UseFixedWorkspace
());
#if CUDNN_VERSION >= 7001
#if CUDNN_VERSION >= 7001
int
perf_count
;
int
actual_perf_count
;
int
best_algo_idx
=
0
;
int
best_algo_idx
=
0
;
std
::
unique_ptr
<
perf_t
[]
>
perf_results
(
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_BWD_DATA_ALGS
);
new
perf_t
[
kNUM_CUDNN_BWD_DATA_ALGS
]);
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm_v7
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm_v7
(
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
idesc
.
desc
(),
kNUM_CUDNN_BWD_DATA_ALGS
,
args
.
cdesc
.
desc
(),
args
.
idesc
.
desc
(),
kNUM_CUDNN_BWD_DATA_ALGS
,
&
actual_perf_count
,
perf_results
.
data
()));
&
perf_count
,
perf_results
.
get
()));
result
.
algo
=
perf_results
[
best_algo_idx
].
algo
;
algo
=
(
perf_results
.
get
())[
best_algo_idx
].
algo
;
#if CUDNN_VERSION < 7500
#if CUDNN_VERSION < 7500
int
stride_dim
=
args
.
x
->
dims
().
size
()
-
2
;
int
stride_dim
=
args
.
x
->
dims
().
size
()
-
2
;
bool
blacklist
=
std
::
any_of
(
args
.
s
.
begin
(),
args
.
s
.
begin
()
+
stride_dim
,
bool
blacklist
=
std
::
any_of
(
args
.
s
.
begin
(),
args
.
s
.
begin
()
+
stride_dim
,
[
=
](
int
n
)
{
return
n
!=
1
;
});
[
=
](
int
n
)
{
return
n
!=
1
;
});
if
(
blacklist
&&
(
static_cast
<
cudnnConvolutionBwdDataAlgo_t
>
(
if
(
blacklist
&&
(
perf_results
[
best_algo_idx
].
algo
==
perf_results
[
best_algo_idx
].
algo
)
==
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
||
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING
||
perf_results
[
best_algo_idx
].
algo
==
static_cast
<
cudnnConvolutionBwdDataAlgo_t
>
(
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT
))
{
perf_results
[
best_algo_idx
].
algo
)
==
result
.
algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
;
CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT
))
{
}
algo
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
;
}
#endif
#endif
workspace_size
=
GetWorkspaceSize
(
args
,
algo
);
result
.
workspace_size
=
GetWorkspaceSize
(
args
,
result
.
algo
);
if
(
workspace_size
>
workspace_size_limit
)
{
if
(
result
.
workspace_size
>
workspace_size_limit
)
{
has_got_workspace_size
=
false
;
#if CUDNN_VERSION >= 8000
#if CUDNN_VERSION >= 8000
// cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8
// cudnnGetConvolutionBackwardDataAlgorithm is removed in CUDNN-8
ChooseAlgoByWorkspace
<
perf_t
,
algo_t
>
(
perf_results
.
get
(),
ChooseAlgoByWorkspace
<
PerfT
,
AlgoT
>
(
perf_results
,
workspace_size_limit
,
kNUM_CUDNN_BWD_DATA_ALGS
,
&
result
);
workspace_size_limit
,
&
algo
);
#else
VLOG
(
1
)
<<
"Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<<
workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
idesc
.
desc
(),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
#endif
}
#else
#else
VLOG
(
1
)
<<
"Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<<
result
.
workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataAlgorithm
(
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
idesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
idesc
.
desc
(),
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
,
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
workspace_size_limit
,
&
(
result
.
algo
)
));
#endif
#endif
}
else
if
(
deterministic
)
{
return
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
;
}
else
{
auto
&
dev_ctx
=
ctx
;
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
AlgorithmsCache
<
algo_t
>&
algo_cache
=
*
(
framework
::
ConvSearchCache
::
Instance
().
GetBackwardData
());
auto
x_dims
=
phi
::
vectorize
(
args
.
x
->
dims
());
auto
w_dims
=
phi
::
vectorize
(
args
.
w
->
dims
());
VLOG
(
10
)
<<
"cudnnConvolutionFwdAlgoPerf_t"
<<
", x_dims:"
<<
x_dims
<<
", w_dims:"
<<
w_dims
<<
", args.s"
<<
args
.
s
<<
", args.p"
<<
args
.
p
<<
", args.d"
<<
args
.
d
;
algo
=
algo_cache
.
GetAlgorithm
(
x_dims
,
w_dims
,
args
.
s
,
args
.
p
,
args
.
d
,
0
,
static_cast
<
int64_t
>
(
args
.
cudnn_dtype
),
[
&
]()
{
int
returned_algo_count
;
std
::
array
<
perf_t
,
kNUM_CUDNN_BWD_DATA_ALGS
>
perf_stat
;
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace_ptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnFindConvolutionBackwardDataAlgorithmEx
(
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
w
->
data
<
T
>
(),
args
.
odesc
.
desc
(),
args
.
o
->
data
<
T
>
(),
args
.
cdesc
.
desc
(),
args
.
idesc
.
desc
(),
const_cast
<
T
*>
(
args
.
x
->
data
<
T
>
()),
kNUM_CUDNN_BWD_DATA_ALGS
,
&
returned_algo_count
,
perf_stat
.
data
(),
cudnn_workspace_ptr
,
workspace_size_limit
));
};
workspace_handle
.
RunFuncSync
(
cudnn_find_func
,
workspace_size_limit
);
VLOG
(
3
)
<<
"BwdDataAlgo Perf result: (algo: stat, time, memory)"
;
for
(
int
i
=
0
;
i
<
returned_algo_count
;
++
i
)
{
const
auto
&
stat
=
perf_stat
[
i
];
VLOG
(
3
)
<<
stat
.
algo
<<
": "
<<
stat
.
status
<<
" "
<<
stat
.
time
<<
" "
<<
stat
.
memory
;
}
return
perf_stat
[
0
].
algo
;
});
}
}
VLOG
(
3
)
<<
"choose algo "
<<
algo
;
#else
return
algo
;
}
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
,
algo_t
algo
)
{
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionBackwardData
WorkspaceSize
(
platform
::
dynload
::
cudnnGetConvolutionBackwardData
Algorithm
(
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
idesc
.
desc
(),
algo
,
&
workspace_size
));
args
.
cdesc
.
desc
(),
args
.
idesc
.
desc
(),
return
workspace_size
;
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
(
result
.
algo
)));
#endif
return
result
;
}
template
<
typename
T
>
static
SearchResult
<
AlgoT
>
FindAlgoExhaustiveSearch
(
const
ConvArgs
&
args
,
const
phi
::
GPUContext
&
ctx
)
{
SearchResult
<
AlgoT
>
result
;
size_t
workspace_size_limit
=
CalcWorkspaceLimitInBytes
(
UseFixedWorkspace
());
size_t
max_workspace_size
=
GetMaxWorkspaceSize
(
args
,
workspace_size_limit
);
VLOG
(
3
)
<<
"max_workspace_size="
<<
ToMegaBytes
(
max_workspace_size
)
<<
" MB"
;
int
returned_algo_count
;
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_BWD_DATA_ALGS
);
auto
cudnn_find_func
=
[
&
](
void
*
workspace_ptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnFindConvolutionBackwardDataAlgorithmEx
(
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
w
->
data
<
T
>
(),
args
.
odesc
.
desc
(),
args
.
o
->
data
<
T
>
(),
args
.
cdesc
.
desc
(),
args
.
idesc
.
desc
(),
const_cast
<
T
*>
(
args
.
x
->
data
<
T
>
()),
kNUM_CUDNN_BWD_DATA_ALGS
,
&
returned_algo_count
,
perf_results
.
data
(),
workspace_ptr
,
max_workspace_size
));
};
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
workspace_handle
.
RunFuncSync
(
cudnn_find_func
,
max_workspace_size
,
UseFixedWorkspace
());
VLOG
(
4
)
<<
GetPerfResultString
<
PerfT
>
(
"[Exhaustive Search] BwdDataAlgo Perf result"
,
perf_results
,
returned_algo_count
,
workspace_size_limit
);
ChooseAlgoByWorkspace
<
PerfT
,
AlgoT
>
(
perf_results
,
workspace_size_limit
,
&
result
);
return
result
;
}
static
size_t
GetMaxWorkspaceSize
(
const
ConvArgs
&
args
,
size_t
workspace_size_limit
)
{
if
(
!
UseFixedWorkspace
())
{
size_t
max_workspace_size
=
0
;
for
(
size_t
algo
=
0
;
algo
<
kNUM_CUDNN_BWD_DATA_ALGS
;
++
algo
)
{
size_t
workspace_size
=
0
;
auto
status
=
platform
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
args
.
handle
,
args
.
wdesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
idesc
.
desc
(),
static_cast
<
cudnnConvolutionBwdDataAlgo_t
>
(
algo
),
&
workspace_size
);
if
(
status
==
CUDNN_STATUS_SUCCESS
&&
workspace_size
<=
workspace_size_limit
)
{
max_workspace_size
=
std
::
max
(
workspace_size
,
max_workspace_size
);
}
}
return
max_workspace_size
;
}
else
{
return
workspace_size_limit
;
}
}
}
};
};
// cuDNN convution backward filter-algorithm searcher, consisted of three
// algorithm searching modes, namely: deterministic, heuristic, and
// exhaustive_search mode. As well as one workspace size acquirsition function
// with respect to the chosen alogrithm.
template
<
>
template
<
>
struct
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
{
struct
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
{
using
perf_t
=
cudnnConvolutionBwdFilterAlgoPerf_t
;
using
PerfT
=
cudnnConvolutionBwdFilterAlgoPerf_t
;
using
algo_t
=
cudnnConvolutionBwdFilterAlgo_t
;
using
AlgoT
=
cudnnConvolutionBwdFilterAlgo_t
;
template
<
typename
T
>
template
<
typename
T
>
static
algo_t
Find
(
const
ConvArgs
&
args
,
bool
exhaustive_search
,
static
SearchResult
<
AlgoT
>
Find
(
const
ConvArgs
&
args
,
bool
exhaustive_search
,
bool
deterministic
,
const
phi
::
GPUContext
&
ctx
)
{
bool
deterministic
,
const
phi
::
GPUContext
&
ctx
)
{
platform
::
CUDAGraphCaptureModeGuard
guard
;
platform
::
CUDAGraphCaptureModeGuard
guard
;
SearchResult
<
AlgoT
>
result
;
auto
dtype
=
platform
::
CudnnDataType
<
T
>::
type
;
auto
dtype
=
platform
::
CudnnDataType
<
T
>::
type
;
size_t
workspace_size_limit
=
FLAGS_conv_workspace_size_limit
*
1024
*
1024
;
size_t
workspace_size
=
0
;
bool
has_got_workspace_size
=
true
;
SetConvMathType
(
ctx
,
dtype
,
args
.
cdesc
);
SetConvMathType
(
ctx
,
dtype
,
args
.
cdesc
);
algo_t
algo
;
if
(
deterministic
)
{
if
(
!
exhaustive_search
&&
!
deterministic
)
{
result
=
FindAlgoDeterministic
();
}
else
{
// 1. Once turning on exhaustive FLAGS, always get exhaustive_search.
// 2. Once turning on auto-tune, runn heuristic search(default) before
// auto-tune process, run exhaustive_search during mentioned process.
// 3. After auto-tune process, run cached algorithm if cached, run
// default mode for the rest.
size_t
key
=
args
.
GetCacheKey
<
T
>
();
auto
&
cache
=
phi
::
autotune
::
AutoTuneCache
::
Instance
().
GetConvBackwardFilter
();
if
(
cache
.
Find
(
key
))
{
result
.
algo
=
static_cast
<
AlgoT
>
(
cache
.
Get
(
key
));
}
else
{
bool
use_autotune
=
phi
::
autotune
::
AutoTuneStatus
::
Instance
().
UseAutoTune
();
if
(
exhaustive_search
||
use_autotune
)
{
result
=
FindAlgoExhaustiveSearch
<
T
>
(
args
,
ctx
);
cache
.
Set
(
key
,
static_cast
<
int64_t
>
(
result
.
algo
));
}
else
{
result
=
FindAlgoHeuristic
(
args
,
ctx
);
}
}
}
VLOG
(
3
)
<<
"[cuDNN Convoltion] exhaustive_search="
<<
exhaustive_search
<<
", deterministic="
<<
deterministic
<<
", choose algo="
<<
result
.
algo
<<
", workspace="
<<
ToMegaBytes
(
GetWorkspaceSize
(
args
,
result
.
algo
))
<<
" MB"
;
return
result
;
}
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
,
cudnnConvolutionBwdFilterAlgo_t
algo
)
{
platform
::
CUDAGraphCaptureModeGuard
guard
;
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
wdesc
.
desc
(),
algo
,
&
workspace_size
));
return
workspace_size
;
}
private:
static
SearchResult
<
AlgoT
>
FindAlgoDeterministic
()
{
return
SearchResult
<
AlgoT
>
(
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
);
}
static
SearchResult
<
AlgoT
>
FindAlgoHeuristic
(
const
ConvArgs
&
args
,
const
phi
::
GPUContext
&
ctx
)
{
SearchResult
<
AlgoT
>
result
;
size_t
workspace_size_limit
=
CalcWorkspaceLimitInBytes
(
UseFixedWorkspace
());
#if CUDNN_VERSION >= 7001
#if CUDNN_VERSION >= 7001
using
perf_t
=
cudnnConvolutionBwdFilterAlgoPerf_t
;
int
actual_perf_count
;
int
perf_count
;
int
best_algo_idx
=
0
;
int
best_algo_idx
=
0
;
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_BWD_FILTER_ALGS
);
std
::
unique_ptr
<
perf_t
[]
>
perf_results
(
PADDLE_ENFORCE_GPU_SUCCESS
(
new
perf_t
[
kNUM_CUDNN_BWD_FILTER_ALGS
]);
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm_v7
(
PADDLE_ENFORCE_GPU_SUCCESS
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm_v7
(
args
.
cdesc
.
desc
(),
args
.
wdesc
.
desc
(),
kNUM_CUDNN_BWD_FILTER_ALGS
,
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
&
actual_perf_count
,
perf_results
.
data
()));
args
.
cdesc
.
desc
(),
args
.
wdesc
.
desc
(),
kNUM_CUDNN_BWD_FILTER_ALGS
,
result
.
algo
=
perf_results
[
best_algo_idx
].
algo
;
&
perf_count
,
perf_results
.
get
()));
result
.
workspace_size
=
perf_results
[
best_algo_idx
].
memory
;
algo
=
(
perf_results
.
get
())[
best_algo_idx
].
algo
;
workspace_size
=
(
perf_results
.
get
())[
best_algo_idx
].
memory
;
if
(
workspace_size
>
workspace_size_limit
)
{
if
(
result
.
workspace_size
>
workspace_size_limit
)
{
workspace_size
=
workspace_size_limit
;
#if CUDNN_VERSION >= 8000
#if CUDNN_VERSION >= 8000
// cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8
// cudnnGetConvolutionBackwardFilterAlgorithm is removed in CUDNN-8
ChooseAlgoByWorkspace
<
perf_t
,
algo_t
>
(
perf_results
.
get
(),
ChooseAlgoByWorkspace
<
PerfT
,
AlgoT
>
(
perf_results
,
workspace_size_limit
,
kNUM_CUDNN_BWD_FILTER_ALGS
,
&
result
);
workspace_size_limit
,
&
algo
);
#else
VLOG
(
1
)
<<
"Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<<
workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
wdesc
.
desc
(),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
#endif
}
#else
#else
VLOG
(
1
)
<<
"Fallback to non-v7 method to find conv algorithm becasue "
"the workspace size request("
<<
result
.
workspace_size
<<
") exceeds the limit("
<<
workspace_size_limit
<<
")"
;
PADDLE_ENFORCE_GPU_SUCCESS
(
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
wdesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
wdesc
.
desc
(),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
,
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
algo
));
workspace_size_limit
,
&
(
result
.
algo
)
));
#endif
#endif
}
else
if
(
deterministic
)
{
}
return
CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1
;
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
wdesc
.
desc
(),
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
,
workspace_size_limit
,
&
(
result
.
algo
)));
#endif
return
result
;
}
template
<
typename
T
>
static
SearchResult
<
AlgoT
>
FindAlgoExhaustiveSearch
(
const
ConvArgs
&
args
,
const
phi
::
GPUContext
&
ctx
)
{
SearchResult
<
AlgoT
>
result
;
int
returned_algo_count
=
0
;
std
::
vector
<
PerfT
>
perf_results
(
kNUM_CUDNN_BWD_FILTER_ALGS
);
size_t
workspace_size_limit
=
CalcWorkspaceLimitInBytes
(
UseFixedWorkspace
());
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
if
(
platform
::
CudnnDataType
<
T
>::
type
!=
CUDNN_DATA_HALF
)
{
size_t
max_workspace_size
=
GetMaxWorkspaceSize
(
args
,
workspace_size_limit
);
VLOG
(
3
)
<<
"max_workspace_size="
<<
ToMegaBytes
(
max_workspace_size
)
<<
" MB"
;
auto
cudnn_find_func
=
[
&
](
void
*
workspace_ptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnFindConvolutionBackwardFilterAlgorithmEx
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
x
->
data
<
T
>
(),
args
.
odesc
.
desc
(),
args
.
o
->
data
<
T
>
(),
args
.
cdesc
.
desc
(),
args
.
wdesc
.
desc
(),
const_cast
<
T
*>
(
args
.
w
->
data
<
T
>
()),
kNUM_CUDNN_BWD_FILTER_ALGS
,
&
returned_algo_count
,
perf_results
.
data
(),
workspace_ptr
,
max_workspace_size
));
};
workspace_handle
.
RunFuncSync
(
cudnn_find_func
,
max_workspace_size
,
UseFixedWorkspace
());
VLOG
(
4
)
<<
GetPerfResultString
<
PerfT
>
(
"[Exhaustive Search] BwdFilterAlgo Perf result"
,
perf_results
,
returned_algo_count
,
workspace_size_limit
);
ChooseAlgoByWorkspace
<
PerfT
,
AlgoT
>
(
perf_results
,
workspace_size_limit
,
&
result
);
}
else
{
}
else
{
auto
&
dev_ctx
=
ctx
;
int
max_algos
=
GetAlgorithmMaxCount
(
args
.
handle
);
auto
workspace_handle
=
dev_ctx
.
cudnn_workspace_handle
();
std
::
vector
<
PerfT
>
perf_results
(
max_algos
);
AlgorithmsCache
<
algo_t
>&
algo_cache
=
PADDLE_ENFORCE_GPU_SUCCESS
(
*
(
framework
::
ConvSearchCache
::
Instance
().
GetBackwardFilter
());
platform
::
dynload
::
cudnnFindConvolutionBackwardFilterAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
auto
x_dims
=
phi
::
vectorize
(
args
.
x
->
dims
());
args
.
cdesc
.
desc
(),
args
.
wdesc
.
desc
(),
perf_results
.
size
(),
auto
w_dims
=
phi
::
vectorize
(
args
.
w
->
dims
());
&
returned_algo_count
,
perf_results
.
data
()));
perf_results
.
resize
(
returned_algo_count
);
VLOG
(
10
)
<<
"cudnnConvolutionFwdAlgoPerf_t:"
<<
", x_dims:"
<<
x_dims
<<
", w_dims:"
<<
w_dims
<<
", args.s"
VLOG
(
4
)
<<
GetPerfResultString
<
PerfT
>
(
<<
args
.
s
<<
", args.p"
<<
args
.
p
<<
", args.d"
<<
args
.
d
;
"[Exhaustive Search] BwdFilterAlgo Perf result"
,
perf_results
,
if
(
dtype
!=
CUDNN_DATA_HALF
)
{
perf_results
.
size
(),
workspace_size_limit
);
algo
=
algo_cache
.
GetAlgorithm
(
ChooseAlgo
(
perf_results
,
workspace_size_limit
,
&
result
);
x_dims
,
w_dims
,
args
.
s
,
args
.
p
,
args
.
d
,
0
,
}
static_cast
<
int64_t
>
(
args
.
cudnn_dtype
),
[
&
]()
{
int
returned_algo_count
;
return
result
;
std
::
array
<
perf_t
,
kNUM_CUDNN_BWD_FILTER_ALGS
>
perf_stat
;
}
auto
cudnn_find_func
=
[
&
](
void
*
cudnn_workspace_ptr
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
static
int
GetAlgorithmMaxCount
(
cudnnHandle_t
handle
)
{
platform
::
dynload
::
#if CUDNN_VERSION_MIN(7, 0, 1)
cudnnFindConvolutionBackwardFilterAlgorithmEx
(
int
max_algos
=
0
;
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
x
->
data
<
T
>
(),
auto
status
=
args
.
odesc
.
desc
(),
args
.
o
->
data
<
T
>
(),
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterAlgorithmMaxCount
(
args
.
cdesc
.
desc
(),
args
.
wdesc
.
desc
(),
handle
,
&
max_algos
);
const_cast
<
T
*>
(
args
.
w
->
data
<
T
>
()),
if
(
status
==
gpuSuccess
)
{
kNUM_CUDNN_BWD_FILTER_ALGS
,
&
returned_algo_count
,
VLOG
(
5
)
<<
"[BackwardFilter] max_algos: predefined="
perf_stat
.
data
(),
cudnn_workspace_ptr
,
<<
kNUM_CUDNN_BWD_FILTER_ALGS
<<
", actual="
<<
max_algos
;
workspace_size_limit
));
return
max_algos
;
};
}
workspace_handle
.
RunFuncSync
(
cudnn_find_func
,
#endif
workspace_size_limit
);
return
kNUM_CUDNN_BWD_FILTER_ALGS
;
}
VLOG
(
3
)
<<
"BwdFilterAlgo Perf result: (algo: stat, time, memory)"
;
static
size_t
GetMaxWorkspaceSize
(
const
ConvArgs
&
args
,
for
(
int
i
=
0
;
i
<
returned_algo_count
;
++
i
)
{
size_t
workspace_size_limit
)
{
const
auto
&
stat
=
perf_stat
[
i
];
if
(
!
UseFixedWorkspace
())
{
VLOG
(
3
)
<<
stat
.
algo
<<
": "
<<
stat
.
status
<<
" "
<<
stat
.
time
size_t
max_workspace_size
=
0
;
<<
" "
<<
stat
.
memory
;
for
(
size_t
algo
=
0
;
algo
<
kNUM_CUDNN_BWD_FILTER_ALGS
;
++
algo
)
{
}
size_t
workspace_size
=
0
;
return
perf_stat
[
0
].
algo
;
auto
status
=
});
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
}
else
{
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
auto
max_algos
=
MaxBwdFilterAlgos
(
args
.
handle
);
args
.
cdesc
.
desc
(),
args
.
wdesc
.
desc
(),
algo
=
algo_cache
.
GetAlgorithm
(
static_cast
<
cudnnConvolutionBwdFilterAlgo_t
>
(
algo
),
x_dims
,
w_dims
,
args
.
s
,
args
.
p
,
args
.
d
,
0
,
&
workspace_size
);
static_cast
<
int64_t
>
(
args
.
cudnn_dtype
),
[
&
]()
{
if
(
status
==
CUDNN_STATUS_SUCCESS
&&
algo_t
chosen_algo
;
workspace_size
<=
workspace_size_limit
)
{
std
::
vector
<
perf_t
>
perf_results
(
max_algos
);
max_workspace_size
=
std
::
max
(
workspace_size
,
max_workspace_size
);
int
actual_algos
=
0
;
}
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnFindConvolutionBackwardFilterAlgorithm
(
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
args
.
cdesc
.
desc
(),
args
.
wdesc
.
desc
(),
perf_results
.
size
(),
&
actual_algos
,
perf_results
.
data
()));
perf_results
.
resize
(
actual_algos
);
ChooseAlgo
<
perf_t
,
algo_t
>
(
perf_results
,
workspace_size_limit
,
&
chosen_algo
);
return
chosen_algo
;
});
}
}
return
max_workspace_size
;
}
else
{
return
workspace_size_limit
;
}
}
VLOG
(
3
)
<<
"choose algo "
<<
algo
;
return
algo
;
}
}
static
size_t
GetWorkspaceSize
(
const
ConvArgs
&
args
,
algo_t
algo
)
{
static
void
ChooseAlgo
(
const
std
::
vector
<
PerfT
>&
perf_results
,
platform
::
CUDAGraphCaptureModeGuard
guard
;
size_t
workspace_limit
,
size_t
workspace_size
=
0
;
SearchResult
<
AlgoT
>*
algo_result
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
for
(
size_t
i
=
0
;
i
!=
perf_results
.
size
();
++
i
)
{
platform
::
dynload
::
cudnnGetConvolutionBackwardFilterWorkspaceSize
(
const
auto
&
result
=
perf_results
[
i
];
args
.
handle
,
args
.
idesc
.
desc
(),
args
.
odesc
.
desc
(),
if
(
result
.
status
==
CUDNN_STATUS_SUCCESS
&&
args
.
cdesc
.
desc
(),
args
.
wdesc
.
desc
(),
algo
,
&
workspace_size
));
(
result
.
memory
<=
workspace_limit
))
{
return
workspace_size
;
if
((
result
.
mathType
==
CUDNN_TENSOR_OP_MATH
)
&&
(
i
!=
perf_results
.
size
()
-
1
))
{
const
auto
&
next_result
=
perf_results
[
i
+
1
];
if
(
next_result
.
status
==
CUDNN_STATUS_SUCCESS
&&
next_result
.
algo
==
result
.
algo
&&
next_result
.
memory
==
result
.
memory
&&
next_result
.
mathType
!=
CUDNN_TENSOR_OP_MATH
&&
next_result
.
time
<
1.01
*
result
.
time
)
{
// Skip over this result- it's not really a Tensor Core algo.
// Because it is only 1% performance difference.
// Prefer to choose the next equivalent non-Tensor Core algo.
continue
;
}
}
algo_result
->
algo
=
result
.
algo
;
algo_result
->
time
=
result
.
time
;
auto
math_type_str
=
"0"
;
if
(
result
.
mathType
==
CUDNN_TENSOR_OP_MATH
)
{
math_type_str
=
"1"
;
}
VLOG
(
3
)
<<
" choose algo: "
<<
result
.
algo
<<
", TC: "
<<
math_type_str
<<
", time: "
<<
result
.
time
<<
" ms, wksp = "
<<
result
.
memory
<<
", status = "
<<
result
.
status
;
break
;
}
}
}
}
};
};
...
...
paddle/fluid/operators/conv_cudnn_op_cache.h
浏览文件 @
b4adbe5c
...
@@ -20,7 +20,7 @@ limitations under the License. */
...
@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
DECLARE_
u
int64
(
conv_workspace_size_limit
);
DECLARE_int64
(
conv_workspace_size_limit
);
DECLARE_bool
(
cudnn_exhaustive_search
);
DECLARE_bool
(
cudnn_exhaustive_search
);
DECLARE_int64
(
cudnn_exhaustive_search_times
);
DECLARE_int64
(
cudnn_exhaustive_search_times
);
...
...
paddle/fluid/operators/conv_miopen_helper.h
浏览文件 @
b4adbe5c
...
@@ -14,42 +14,12 @@ limitations under the License. */
...
@@ -14,42 +14,12 @@ limitations under the License. */
#pragma once
#pragma once
#include <algorithm>
#include "paddle/fluid/operators/conv_base_helper.h"
#include <array>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/conv_search_cache.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
ConvArgs
=
ConvArgsBase
<
miopenHandle_t
,
miopenDataType_t
>
;
using
DataLayout
=
platform
::
DataLayout
;
template
<
typename
T
>
using
ScalingParamType
=
typename
platform
::
CudnnDataType
<
T
>::
ScalingParamType
;
using
framework
::
AlgorithmsCache
;
static
inline
void
GetNCDHW
(
const
framework
::
DDim
&
dims
,
const
DataLayout
&
layout
,
int
*
N
,
int
*
C
,
int
*
D
,
int
*
H
,
int
*
W
)
{
*
N
=
dims
[
0
];
*
C
=
layout
==
DataLayout
::
kNCHW
?
dims
[
1
]
:
dims
[
dims
.
size
()
-
1
];
int
i
=
layout
==
DataLayout
::
kNCHW
?
0
:
1
;
if
(
dims
.
size
()
==
5
)
{
*
D
=
dims
[
2
-
i
];
*
H
=
dims
[
3
-
i
];
*
W
=
dims
[
4
-
i
];
}
else
{
*
D
=
1
;
*
H
=
dims
[
2
-
i
];
*
W
=
dims
[
3
-
i
];
}
}
template
<
typename
DeviceContext
,
typename
T
,
size_t
D
>
template
<
typename
DeviceContext
,
typename
T
,
size_t
D
>
static
void
RemovePaddingSlice
(
const
phi
::
GPUContext
&
context
,
static
void
RemovePaddingSlice
(
const
phi
::
GPUContext
&
context
,
...
@@ -66,9 +36,8 @@ static void RemovePaddingSlice(const phi::GPUContext& context,
...
@@ -66,9 +36,8 @@ static void RemovePaddingSlice(const phi::GPUContext& context,
extents
[
i
]
=
new_out_dims
[
i
];
extents
[
i
]
=
new_out_dims
[
i
];
}
}
int
start
;
for
(
size_t
i
=
0
;
i
<
axes
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
axes
.
size
();
++
i
)
{
start
=
starts
[
i
];
int
start
=
starts
[
i
];
if
(
start
<
0
)
{
if
(
start
<
0
)
{
start
=
(
start
+
in_dims
[
axes
[
i
]]);
start
=
(
start
+
in_dims
[
axes
[
i
]]);
}
}
...
@@ -85,41 +54,6 @@ static void RemovePaddingSlice(const phi::GPUContext& context,
...
@@ -85,41 +54,6 @@ static void RemovePaddingSlice(const phi::GPUContext& context,
out_t
.
device
(
place
)
=
in_t
.
slice
(
offsets
,
extents
);
out_t
.
device
(
place
)
=
in_t
.
slice
(
offsets
,
extents
);
}
}
template
<
typename
T
>
std
::
ostream
&
operator
<<
(
std
::
ostream
&
out
,
const
std
::
vector
<
T
>&
v
)
{
out
<<
"["
;
for
(
auto
const
&
tmp
:
v
)
out
<<
tmp
<<
","
;
out
<<
"]"
;
return
out
;
}
using
framework
::
ConvSearchCache
;
struct
ConvArgs
{
miopenHandle_t
handle
;
platform
::
TensorDescriptor
idesc
,
odesc
;
platform
::
FilterDescriptor
wdesc
;
platform
::
ConvolutionDescriptor
cdesc
;
const
framework
::
Tensor
*
x
,
*
w
,
*
o
;
miopenDataType_t
cudnn_dtype
;
// strides
std
::
vector
<
int
>
s
;
// paddings
std
::
vector
<
int
>
p
;
// dilations
std
::
vector
<
int
>
d
;
ConvArgs
(
const
framework
::
Tensor
*
x
,
const
framework
::
Tensor
*
w
,
const
framework
::
Tensor
*
o
,
const
std
::
vector
<
int
>
s
,
const
std
::
vector
<
int
>
p
,
const
std
::
vector
<
int
>
d
,
miopenDataType_t
dtype
)
:
x
(
x
),
w
(
w
),
o
(
o
),
s
(
s
),
p
(
p
),
d
(
d
),
cudnn_dtype
(
dtype
)
{}
};
template
<
typename
algo_t
>
struct
SearchAlgorithm
{};
template
<
>
template
<
>
struct
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
{
struct
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
{
using
perf_t
=
miopenConvAlgoPerf_t
;
using
perf_t
=
miopenConvAlgoPerf_t
;
...
...
paddle/fluid/operators/fused/fusion_conv_inception_op.cu
浏览文件 @
b4adbe5c
...
@@ -16,8 +16,6 @@ limitations under the License. */
...
@@ -16,8 +16,6 @@ limitations under the License. */
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
DECLARE_uint64
(
conv_workspace_size_limit
);
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
...
paddle/fluid/platform/device/gpu/gpu_info.cc
浏览文件 @
b4adbe5c
...
@@ -188,6 +188,8 @@ class RecordedGpuMallocHelper {
...
@@ -188,6 +188,8 @@ class RecordedGpuMallocHelper {
if
(
UNLIKELY
(
malloc_managed_memory
))
{
if
(
UNLIKELY
(
malloc_managed_memory
))
{
result
=
cudaMallocManaged
(
ptr
,
size
);
result
=
cudaMallocManaged
(
ptr
,
size
);
}
else
{
}
else
{
VLOG
(
10
)
<<
"[cudaMalloc] size="
<<
static_cast
<
double
>
(
size
)
/
(
1
<<
20
)
<<
" MB"
;
result
=
cudaMalloc
(
ptr
,
size
);
result
=
cudaMalloc
(
ptr
,
size
);
}
}
#endif
#endif
...
@@ -226,6 +228,8 @@ class RecordedGpuMallocHelper {
...
@@ -226,6 +228,8 @@ class RecordedGpuMallocHelper {
if
(
err
!=
hipErrorDeinitialized
)
{
if
(
err
!=
hipErrorDeinitialized
)
{
#else
#else
auto
err
=
cudaFree
(
ptr
);
auto
err
=
cudaFree
(
ptr
);
VLOG
(
10
)
<<
"[cudaFree] size="
<<
static_cast
<
double
>
(
size
)
/
(
1
<<
20
)
<<
" MB"
;
if
(
err
!=
cudaErrorCudartUnloading
)
{
if
(
err
!=
cudaErrorCudartUnloading
)
{
#endif
#endif
PADDLE_ENFORCE_GPU_SUCCESS
(
err
);
PADDLE_ENFORCE_GPU_SUCCESS
(
err
);
...
...
paddle/fluid/platform/device_context.cc
浏览文件 @
b4adbe5c
...
@@ -522,8 +522,8 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) {
...
@@ -522,8 +522,8 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) {
cuda_stream_
.
reset
(
new
stream
::
CUDAStream
(
phi
::
GPUContext
::
stream
(),
place
));
cuda_stream_
.
reset
(
new
stream
::
CUDAStream
(
phi
::
GPUContext
::
stream
(),
place
));
auto
&
instance
=
memory
::
allocation
::
AllocatorFacade
::
Instance
();
auto
&
instance
=
memory
::
allocation
::
AllocatorFacade
::
Instance
();
instance
.
SetDefaultStream
(
place
,
phi
::
GPUContext
::
stream
());
instance
.
SetDefaultStream
(
place
,
phi
::
GPUContext
::
stream
());
workspace_
.
reset
(
workspace_
.
reset
(
new
phi
::
DnnWorkspaceHandle
(
new
phi
::
DnnWorkspaceHandle
(
instance
.
GetAllocator
(
place
).
get
()));
instance
.
GetAllocator
(
place
).
get
(),
stream
()));
}
}
CUDADeviceContext
::~
CUDADeviceContext
()
=
default
;
CUDADeviceContext
::~
CUDADeviceContext
()
=
default
;
...
@@ -623,7 +623,8 @@ phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
...
@@ -623,7 +623,8 @@ phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
return
phi
::
DnnWorkspaceHandle
(
return
phi
::
DnnWorkspaceHandle
(
memory
::
allocation
::
AllocatorFacade
::
Instance
()
memory
::
allocation
::
AllocatorFacade
::
Instance
()
.
GetAllocator
(
GetPlace
())
.
GetAllocator
(
GetPlace
())
.
get
());
.
get
(),
stream
());
}
}
return
phi
::
GPUContext
::
cudnn_workspace_handle
();
return
phi
::
GPUContext
::
cudnn_workspace_handle
();
}
}
...
...
paddle/fluid/platform/flags.cc
浏览文件 @
b4adbe5c
...
@@ -158,10 +158,9 @@ PADDLE_DEFINE_EXPORTED_bool(
...
@@ -158,10 +158,9 @@ PADDLE_DEFINE_EXPORTED_bool(
* increased.
* increased.
* Users need to balance memory and speed.
* Users need to balance memory and speed.
*/
*/
PADDLE_DEFINE_EXPORTED_uint64
(
PADDLE_DEFINE_EXPORTED_int64
(
conv_workspace_size_limit
,
conv_workspace_size_limit
,
paddle
::
platform
::
kDefaultConvWorkspaceSizeLimitMB
,
paddle
::
platform
::
kDefaultConvWorkspaceSizeLimitMB
,
"cuDNN convolution workspace limit in MB unit."
);
"cuDNN convolution workspace limit in MB unit."
);
/**
/**
* CUDNN related FLAG
* CUDNN related FLAG
...
@@ -800,3 +799,12 @@ DEFINE_bool(enable_ins_parser_file, false,
...
@@ -800,3 +799,12 @@ DEFINE_bool(enable_ins_parser_file, false,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PADDLE_DEFINE_EXPORTED_bool
(
nccl_blocking_wait
,
false
,
"nccl blocking wait"
);
PADDLE_DEFINE_EXPORTED_bool
(
nccl_blocking_wait
,
false
,
"nccl blocking wait"
);
#endif
#endif
/**
* Autotune related FLAG
* Name: FLAGS_use_autotune
* Since Version: 2.3.0
* Value Range: bool, default=false
* Example:
*/
PADDLE_DEFINE_EXPORTED_bool
(
use_autotune
,
false
,
"Whether enable autotune."
);
paddle/fluid/pybind/pybind.cc
浏览文件 @
b4adbe5c
...
@@ -4430,7 +4430,7 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -4430,7 +4430,7 @@ All parameter, weight, gradient are variables in Paddle.
return
phi
::
autotune
::
AutoTuneStatus
::
Instance
().
DisableAutoTune
();
return
phi
::
autotune
::
AutoTuneStatus
::
Instance
().
DisableAutoTune
();
});
});
m
.
def
(
"autotune_range"
,
[](
int64_t
start
,
int64_t
stop
)
{
m
.
def
(
"
set_
autotune_range"
,
[](
int64_t
start
,
int64_t
stop
)
{
return
phi
::
autotune
::
AutoTuneStatus
::
Instance
().
SetAutoTuneRange
(
start
,
return
phi
::
autotune
::
AutoTuneStatus
::
Instance
().
SetAutoTuneRange
(
start
,
stop
);
stop
);
});
});
...
@@ -4439,10 +4439,8 @@ All parameter, weight, gradient are variables in Paddle.
...
@@ -4439,10 +4439,8 @@ All parameter, weight, gradient are variables in Paddle.
[]
{
return
phi
::
autotune
::
AutoTuneStatus
::
Instance
().
Update
();
});
[]
{
return
phi
::
autotune
::
AutoTuneStatus
::
Instance
().
Update
();
});
m
.
def
(
"autotune_status"
,
[]
{
m
.
def
(
"autotune_status"
,
[]
{
phi
::
autotune
::
AutoTuneCache
::
Instance
().
UpdateStatus
();
py
::
dict
res
;
py
::
dict
res
;
res
[
"use_autotune"
]
=
phi
::
autotune
::
AutoTuneCache
::
Instance
().
UpdateStatus
();
phi
::
autotune
::
AutoTuneStatus
::
Instance
().
UseAutoTune
();
res
[
"step_id"
]
=
phi
::
autotune
::
AutoTuneStatus
::
Instance
().
StepID
();
res
[
"step_id"
]
=
phi
::
autotune
::
AutoTuneStatus
::
Instance
().
StepID
();
res
[
"cache_size"
]
=
phi
::
autotune
::
AutoTuneCache
::
Instance
().
Size
();
res
[
"cache_size"
]
=
phi
::
autotune
::
AutoTuneCache
::
Instance
().
Size
();
res
[
"cache_hit_rate"
]
=
res
[
"cache_hit_rate"
]
=
...
...
paddle/phi/backends/gpu/gpu_context.cc
浏览文件 @
b4adbe5c
...
@@ -12,6 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
...
@@ -12,6 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include <algorithm>
#include <algorithm>
#include <array>
#include <array>
...
@@ -155,6 +156,39 @@ static void StreamCallbackFunc(gpuStream_t stream,
...
@@ -155,6 +156,39 @@ static void StreamCallbackFunc(gpuStream_t stream,
}
// namespace internal
}
// namespace internal
void
DnnWorkspaceHandle
::
RunFuncSync
(
const
std
::
function
<
void
(
void
*
)
>&
cudnn_func
,
size_t
required_workspace_bytes
,
bool
use_cached_allocation
)
{
bool
need_realloc
=
required_workspace_bytes
>
WorkspaceSize
();
if
(
need_realloc
&&
!
use_cached_allocation
)
{
void
*
workspace_ptr
=
nullptr
;
size_t
size
=
((
required_workspace_bytes
+
255
)
>>
8
)
<<
8
;
std
::
lock_guard
<
std
::
mutex
>
guard
(
*
mtx_
);
#ifdef PADDLE_WITH_HIP
auto
status
=
hipMalloc
(
&
workspace_ptr
,
size
);
#else
auto
status
=
cudaMalloc
(
&
workspace_ptr
,
size
);
#endif
if
(
status
==
gpuSuccess
)
{
cudnn_func
(
workspace_ptr
);
phi
::
backends
::
gpu
::
GpuStreamSync
(
stream_
);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS
(
hipFree
(
workspace_ptr
));
#else
PADDLE_ENFORCE_GPU_SUCCESS
(
cudaFree
(
workspace_ptr
));
#endif
return
;
}
}
RunFunc
(
cudnn_func
,
required_workspace_bytes
);
if
(
need_realloc
)
{
// Release the workspace allocated in this running.
ResetWorkspace
();
}
}
void
DnnWorkspaceHandle
::
ResetWorkspace
()
{
allocation_
=
nullptr
;
}
void
DnnWorkspaceHandle
::
ResetWorkspace
()
{
allocation_
=
nullptr
;
}
void
DnnWorkspaceHandle
::
ReallocWorkspace
(
size_t
required_workspace_bytes
)
{
void
DnnWorkspaceHandle
::
ReallocWorkspace
(
size_t
required_workspace_bytes
)
{
...
@@ -295,13 +329,13 @@ struct GPUContext::Impl {
...
@@ -295,13 +329,13 @@ struct GPUContext::Impl {
void
InitDnnWorkspace
()
{
void
InitDnnWorkspace
()
{
PD_CHECK
(
allocator_
!=
nullptr
,
PD_CHECK
(
allocator_
!=
nullptr
,
"the device allocator for gpu context is nullptr."
);
"the device allocator for gpu context is nullptr."
);
workspace_
=
new
DnnWorkspaceHandle
(
allocator_
);
workspace_
=
new
DnnWorkspaceHandle
(
allocator_
,
stream_
);
}
}
void
DestoryInternalWorkspace
()
{
void
DestoryInternalWorkspace
()
{
if
(
owned_
&&
workspace_
!=
nullptr
)
{
if
(
owned_
&&
workspace_
!=
nullptr
)
{
delete
workspace_
;
delete
workspace_
;
stream
_
=
nullptr
;
workspace
_
=
nullptr
;
}
}
}
}
...
@@ -313,7 +347,7 @@ struct GPUContext::Impl {
...
@@ -313,7 +347,7 @@ struct GPUContext::Impl {
DnnWorkspaceHandle
GetDnnWorkspace
()
{
DnnWorkspaceHandle
GetDnnWorkspace
()
{
PD_CHECK
(
allocator_
!=
nullptr
,
PD_CHECK
(
allocator_
!=
nullptr
,
"the device allocator for gpu context is nullptr."
);
"the device allocator for gpu context is nullptr."
);
return
DnnWorkspaceHandle
(
allocator_
);
return
DnnWorkspaceHandle
(
allocator_
,
stream_
);
}
}
void
InitStream
()
{
void
InitStream
()
{
...
...
paddle/phi/backends/gpu/gpu_context.h
浏览文件 @
b4adbe5c
...
@@ -21,6 +21,7 @@ limitations under the License. */
...
@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/forwards.h"
#include "paddle/phi/backends/gpu/forwards.h"
#include "paddle/phi/backends/gpu/gpu_decls.h"
#include "paddle/phi/backends/gpu/gpu_decls.h"
#include "paddle/phi/backends/gpu/gpu_helper.h"
#include "paddle/phi/backends/gpu/gpu_helper.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/device_context.h"
...
@@ -28,8 +29,8 @@ namespace phi {
...
@@ -28,8 +29,8 @@ namespace phi {
class
DnnWorkspaceHandle
{
class
DnnWorkspaceHandle
{
public:
public:
explicit
inline
DnnWorkspaceHandle
(
Allocator
*
allocator
)
inline
DnnWorkspaceHandle
(
Allocator
*
allocator
,
gpuStream_t
stream
)
:
allocator_
(
allocator
)
{
:
allocator_
(
allocator
)
,
stream_
(
stream
)
{
mtx_
.
reset
(
new
std
::
mutex
());
mtx_
.
reset
(
new
std
::
mutex
());
}
}
...
@@ -48,11 +49,9 @@ class DnnWorkspaceHandle {
...
@@ -48,11 +49,9 @@ class DnnWorkspaceHandle {
* running the function. Currently this function is only used when cudnn
* running the function. Currently this function is only used when cudnn
* exhaustive searching and callers have to guarantee that the input function
* exhaustive searching and callers have to guarantee that the input function
* is host blocking */
* is host blocking */
inline
void
RunFuncSync
(
const
std
::
function
<
void
(
void
*
)
>&
cudnn_func
,
void
RunFuncSync
(
const
std
::
function
<
void
(
void
*
)
>&
cudnn_func
,
size_t
required_workspace_bytes
)
{
size_t
required_workspace_bytes
,
RunFunc
(
cudnn_func
,
required_workspace_bytes
);
bool
use_cached_allocation
=
true
);
ResetWorkspace
();
}
inline
size_t
WorkspaceSize
()
{
inline
size_t
WorkspaceSize
()
{
if
(
allocation_
==
nullptr
)
{
if
(
allocation_
==
nullptr
)
{
...
@@ -70,7 +69,8 @@ class DnnWorkspaceHandle {
...
@@ -70,7 +69,8 @@ class DnnWorkspaceHandle {
private:
private:
Allocator
::
AllocationPtr
allocation_
{
nullptr
};
Allocator
::
AllocationPtr
allocation_
{
nullptr
};
Allocator
*
allocator_
{
nullptr
};
Allocator
*
allocator_
{
nullptr
};
// Not owned
gpuStream_t
stream_
{
nullptr
};
// Not owned
std
::
unique_ptr
<
std
::
mutex
>
mtx_
;
std
::
unique_ptr
<
std
::
mutex
>
mtx_
;
};
};
...
...
paddle/phi/kernels/CMakeLists.txt
浏览文件 @
b4adbe5c
...
@@ -6,12 +6,15 @@ file(APPEND ${kernel_declare_file} "#include \"paddle/phi/core/kernel_registry.h
...
@@ -6,12 +6,15 @@ file(APPEND ${kernel_declare_file} "#include \"paddle/phi/core/kernel_registry.h
# phi functors and functions called by kernels
# phi functors and functions called by kernels
add_subdirectory
(
funcs
)
add_subdirectory
(
funcs
)
# kernel autotune
add_subdirectory
(
autotune
)
# phi depends all phi kernel targets
# phi depends all phi kernel targets
set_property
(
GLOBAL PROPERTY PHI_KERNELS
""
)
set_property
(
GLOBAL PROPERTY PHI_KERNELS
""
)
# [ 1. Common kernel compilation dependencies ]
# [ 1. Common kernel compilation dependencies ]
set
(
COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel
)
set
(
COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel
)
set
(
COMMON_KERNEL_DEPS
${
COMMON_KERNEL_DEPS
}
eigen_function blas math_function im2col vol2col concat_and_split_functor selected_rows_functor
)
set
(
COMMON_KERNEL_DEPS
${
COMMON_KERNEL_DEPS
}
eigen_function blas math_function im2col vol2col concat_and_split_functor selected_rows_functor
)
# remove this dep after removing fluid deps on tensor creation
# remove this dep after removing fluid deps on tensor creation
set
(
COMMON_KERNEL_DEPS
${
COMMON_KERNEL_DEPS
}
phi_api_utils
)
set
(
COMMON_KERNEL_DEPS
${
COMMON_KERNEL_DEPS
}
phi_api_utils
)
set
(
COMMON_KERNEL_DEPS
${
COMMON_KERNEL_DEPS
}
infermeta
)
set
(
COMMON_KERNEL_DEPS
${
COMMON_KERNEL_DEPS
}
infermeta
)
...
@@ -27,13 +30,17 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel)
...
@@ -27,13 +30,17 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel)
# Some kernels depend on some targets that are not commonly used.
# Some kernels depend on some targets that are not commonly used.
# These targets are not suitable for common dependencies.
# These targets are not suitable for common dependencies.
# In this case, you need to manually generate them here.
# In this case, you need to manually generate them here.
set
(
MANUAL_BUILD_KERNELS cross_entropy_kernel deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel
set
(
AUTOTUNE_KERNELS conv_kernel conv_grad_kernel conv_grad_grad_kernel conv_transpose_kernel conv_transpose_grad_kernel
)
set
(
MANUAL_BUILD_KERNELS
${
AUTOTUNE_KERNELS
}
cross_entropy_kernel deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel
gumbel_softmax_kernel gumbel_softmax_grad_kernel hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel
gumbel_softmax_kernel gumbel_softmax_grad_kernel hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel
matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel
matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel
put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel
put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel
softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel
softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel
triangular_solve_grad_kernel determinant_grad_kernel reduce_sum_kernel rnn_kernel rnn_grad_kernel warpctc_kernel warpctc_grad_kernel
)
triangular_solve_grad_kernel determinant_grad_kernel reduce_sum_kernel rnn_kernel rnn_grad_kernel warpctc_kernel warpctc_grad_kernel
)
foreach
(
src
${
AUTOTUNE_KERNELS
}
)
kernel_library
(
${
src
}
DEPS
${
COMMON_KERNEL_DEPS
}
switch_autotune
)
endforeach
()
kernel_library
(
cross_entropy_kernel DEPS
${
COMMON_KERNEL_DEPS
}
softmax cross_entropy
)
kernel_library
(
cross_entropy_kernel DEPS
${
COMMON_KERNEL_DEPS
}
softmax cross_entropy
)
kernel_library
(
deformable_conv_kernel DEPS
${
COMMON_KERNEL_DEPS
}
deformable_conv_functor
)
kernel_library
(
deformable_conv_kernel DEPS
${
COMMON_KERNEL_DEPS
}
deformable_conv_functor
)
kernel_library
(
deformable_conv_grad_kernel DEPS
${
COMMON_KERNEL_DEPS
}
deformable_conv_functor
)
kernel_library
(
deformable_conv_grad_kernel DEPS
${
COMMON_KERNEL_DEPS
}
deformable_conv_functor
)
...
@@ -74,6 +81,3 @@ add_subdirectory(selected_rows)
...
@@ -74,6 +81,3 @@ add_subdirectory(selected_rows)
copy_if_different
(
${
kernel_declare_file
}
${
kernel_declare_file_final
}
)
copy_if_different
(
${
kernel_declare_file
}
${
kernel_declare_file_final
}
)
# For strings kernels
# For strings kernels
add_subdirectory
(
strings
)
add_subdirectory
(
strings
)
# 5. kernel autotune
add_subdirectory
(
autotune
)
paddle/phi/kernels/autotune/CMakeLists.txt
浏览文件 @
b4adbe5c
if
(
WITH_GPU
)
if
(
WITH_GPU
)
nv_test
(
gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest
)
nv_test
(
gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest
)
nv_test
(
auto_tune_test SRCS auto_tune_test.cu DEPS gtest
)
nv_test
(
auto_tune_test SRCS auto_tune_test.cu DEPS gtest
)
elseif
(
WITH_ROCM
)
elseif
(
WITH_ROCM
)
hip_test
(
gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest
)
hip_test
(
gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest
)
hip_test
(
auto_tune_test SRCS auto_tune_test.cu DEPS gtest
)
hip_test
(
auto_tune_test SRCS auto_tune_test.cu DEPS gtest
)
endif
()
endif
()
cc_library
(
cache SRCS cache.cc DEPS boost
)
cc_library
(
cache SRCS cache.cc DEPS boost
)
cc_library
(
switch_autotune SRCS switch_autotune.cc DEPS cache flags
)
cc_test
(
cache_test SRCS cache_test.cc DEPS gtest cache
)
cc_test
(
cache_test SRCS cache_test.cc DEPS gtest cache
)
paddle/phi/kernels/autotune/cache.cc
浏览文件 @
b4adbe5c
...
@@ -13,6 +13,8 @@
...
@@ -13,6 +13,8 @@
// limitations under the License.
// limitations under the License.
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include <iomanip>
#include "glog/logging.h"
namespace
phi
{
namespace
phi
{
namespace
autotune
{
namespace
autotune
{
...
@@ -32,5 +34,40 @@ size_t ConvKey(const std::vector<int64_t>& x_dims,
...
@@ -32,5 +34,40 @@ size_t ConvKey(const std::vector<int64_t>& x_dims,
static_cast
<
int64_t
>
(
dtype
));
static_cast
<
int64_t
>
(
dtype
));
}
}
std
::
string
AlgorithmTypeString
(
int64_t
algo_type
)
{
if
(
algo_type
==
static_cast
<
int64_t
>
(
AlgorithmType
::
kConvForward
))
{
return
"conv_forward"
;
}
else
if
(
algo_type
==
static_cast
<
int64_t
>
(
AlgorithmType
::
kConvBackwardData
))
{
return
"conv_backward_data"
;
}
else
if
(
algo_type
==
static_cast
<
int64_t
>
(
AlgorithmType
::
kConvBackwardFilter
))
{
return
"conv_backward_filter"
;
}
return
std
::
to_string
(
algo_type
);
}
void
AutoTuneCache
::
UpdateStatus
()
{
int64_t
size
=
0
;
int64_t
cache_hits
=
0
;
int64_t
cache_misses
=
0
;
int
name_width
=
24
;
std
::
cout
.
setf
(
std
::
ios
::
left
);
for
(
auto
&
v
:
auto_tune_map_
)
{
VLOG
(
4
)
<<
"AlgoType: "
<<
std
::
setfill
(
' '
)
<<
std
::
setw
(
name_width
)
<<
AlgorithmTypeString
(
v
.
first
)
<<
" Cache Size: "
<<
v
.
second
.
Size
()
<<
" Hits: "
<<
v
.
second
.
CacheHits
()
<<
" Misses: "
<<
v
.
second
.
CacheMisses
()
<<
" Hit Rate: "
<<
v
.
second
.
CacheHitRate
();
size
+=
v
.
second
.
Size
();
cache_hits
+=
v
.
second
.
CacheHits
();
cache_misses
+=
v
.
second
.
CacheMisses
();
}
total_size_
=
size
;
total_cache_hits_
=
cache_hits
;
total_cache_misses_
=
cache_misses
;
}
}
// namespace autotune
}
// namespace autotune
}
// namespace phi
}
// namespace phi
paddle/phi/kernels/autotune/cache.h
浏览文件 @
b4adbe5c
...
@@ -13,11 +13,12 @@
...
@@ -13,11 +13,12 @@
// limitations under the License.
// limitations under the License.
#pragma once
#pragma once
#include <algorithm>
#include <algorithm>
#include <mutex>
#include <mutex>
#include <numeric>
#include <unordered_map>
#include <unordered_map>
#include <vector>
#include <vector>
#include "glog/logging.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/errors.h"
#include "paddle/phi/core/errors.h"
...
@@ -92,6 +93,13 @@ class AlgorithmsCache {
...
@@ -92,6 +93,13 @@ class AlgorithmsCache {
return
ret
;
return
ret
;
}
}
void
Clean
()
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
*
cache_mutex_
);
hash_
.
clear
();
cache_hits_
=
0
;
cache_misses_
=
0
;
}
void
Set
(
size_t
key
,
AlgorithmT
algo
)
{
void
Set
(
size_t
key
,
AlgorithmT
algo
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
*
cache_mutex_
);
std
::
lock_guard
<
std
::
mutex
>
lock
(
*
cache_mutex_
);
hash_
[
key
]
=
algo
;
hash_
[
key
]
=
algo
;
...
@@ -116,15 +124,22 @@ class AlgorithmsCache {
...
@@ -116,15 +124,22 @@ class AlgorithmsCache {
private:
private:
std
::
unordered_map
<
size_t
,
AlgorithmT
>
hash_
;
std
::
unordered_map
<
size_t
,
AlgorithmT
>
hash_
;
std
::
shared_ptr
<
std
::
mutex
>
cache_mutex_
;
std
::
shared_ptr
<
std
::
mutex
>
cache_mutex_
;
int64_t
cache_hits_
=
0
;
int64_t
cache_misses_
=
0
;
int64_t
cache_hits_
{
0
};
int64_t
cache_misses_
{
0
};
};
enum
class
AlgorithmType
{
kConvForward
=
1
,
kConvBackwardData
=
2
,
kConvBackwardFilter
=
3
,
kAlgorithmCount
=
4
};
};
// AlgorithmsConfigKey -> AlgorithmsID
// AlgorithmsConfigKey -> AlgorithmsID
using
AlgorithmsConfigKeyMap
=
AlgorithmsCache
<
int64_t
>
;
using
AlgorithmsCacheMap
=
AlgorithmsCache
<
int64_t
>
;
// AlgorithmsType -> AlgorithmsCache
// AlgorithmType -> AlgorithmsCache
using
AlgorithmsTypeMap
=
using
AlgorithmsTypeMap
=
std
::
unordered_map
<
int64_t
,
AlgorithmsCacheMap
>
;
std
::
unordered_map
<
std
::
string
,
AlgorithmsConfigKeyMap
>
;
class
AutoTuneCache
{
class
AutoTuneCache
{
public:
public:
...
@@ -133,42 +148,30 @@ class AutoTuneCache {
...
@@ -133,42 +148,30 @@ class AutoTuneCache {
return
autotune_cache
;
return
autotune_cache
;
}
}
AlgorithmsConfigKeyMap
&
RegisterOrGet
(
const
std
::
string
&
algo_type
)
{
AlgorithmsCacheMap
&
Get
(
const
AlgorithmType
&
algo_type
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
*
autotune_cache_mutex_
);
return
auto_tune_map_
[
static_cast
<
int64_t
>
(
algo_type
)];
if
(
auto_tune_map_
.
find
(
algo_type
)
==
auto_tune_map_
.
end
())
{
AlgorithmsConfigKeyMap
cache
;
auto_tune_map_
[
algo_type
]
=
cache
;
}
return
auto_tune_map_
[
algo_type
];
}
}
void
Clean
(
float
miss_rate
)
{
AlgorithmsCacheMap
&
GetConvForward
()
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
*
autotune_cache_mutex_
);
return
Get
(
AlgorithmType
::
kConvForward
);
// Set a small tolerance to avoid performance degradation
}
// due to large cache size under dynamic shape.
if
(
miss_rate
>
0.01
)
{
AlgorithmsCacheMap
&
GetConvBackwardData
()
{
auto_tune_map_
.
clear
();
return
Get
(
AlgorithmType
::
kConvBackwardData
);
}
}
AlgorithmsCacheMap
&
GetConvBackwardFilter
()
{
return
Get
(
AlgorithmType
::
kConvBackwardFilter
);
}
}
void
UpdateStatus
()
{
void
Clean
()
{
int64_t
size
=
0
;
int64_t
cache_hits
=
0
;
int64_t
cache_misses
=
0
;
for
(
auto
&
v
:
auto_tune_map_
)
{
for
(
auto
&
v
:
auto_tune_map_
)
{
VLOG
(
4
)
<<
"AlgoType: "
<<
v
.
first
<<
" Cache Size: "
<<
v
.
second
.
Size
()
v
.
second
.
Clean
();
<<
" Hits: "
<<
v
.
second
.
CacheHits
()
<<
" Misses: "
<<
v
.
second
.
CacheMisses
()
<<
" Hit Rate: "
<<
v
.
second
.
CacheHitRate
();
size
+=
v
.
second
.
Size
();
cache_hits
+=
v
.
second
.
CacheHits
();
cache_misses
+=
v
.
second
.
CacheMisses
();
}
}
total_size_
=
size
;
total_cache_hits_
=
cache_hits
;
total_cache_misses_
=
cache_misses
;
}
}
void
UpdateStatus
();
// The number of total config cached
// The number of total config cached
int64_t
Size
()
const
{
return
total_size_
;
}
int64_t
Size
()
const
{
return
total_size_
;
}
...
@@ -183,17 +186,30 @@ class AutoTuneCache {
...
@@ -183,17 +186,30 @@ class AutoTuneCache {
total_cache_hit_rate
=
static_cast
<
float
>
(
total_cache_hits_
)
/
total_cache_hit_rate
=
static_cast
<
float
>
(
total_cache_hits_
)
/
static_cast
<
float
>
(
total_num_accesses
);
static_cast
<
float
>
(
total_num_accesses
);
}
}
return
total_cache_hit_rate
;
return
total_cache_hit_rate
;
}
}
private:
private:
AutoTuneCache
()
:
autotune_cache_mutex_
(
new
std
::
mutex
())
{}
AutoTuneCache
()
:
autotune_cache_mutex_
(
new
std
::
mutex
())
{
for
(
int
i
=
1
;
i
<
static_cast
<
int
>
(
AlgorithmType
::
kAlgorithmCount
);
++
i
)
{
Register
(
static_cast
<
AlgorithmType
>
(
i
));
}
}
void
Register
(
const
AlgorithmType
&
algo_type
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
*
autotune_cache_mutex_
);
int64_t
key
=
static_cast
<
int64_t
>
(
algo_type
);
if
(
auto_tune_map_
.
find
(
key
)
==
auto_tune_map_
.
end
())
{
AlgorithmsCacheMap
cache
;
auto_tune_map_
[
key
]
=
cache
;
}
}
AlgorithmsTypeMap
auto_tune_map_
;
AlgorithmsTypeMap
auto_tune_map_
;
std
::
shared_ptr
<
std
::
mutex
>
autotune_cache_mutex_
;
std
::
shared_ptr
<
std
::
mutex
>
autotune_cache_mutex_
;
int64_t
total_cache_hits_
=
0
;
int64_t
total_cache_hits_
{
0
}
;
int64_t
total_cache_misses_
=
0
;
int64_t
total_cache_misses_
{
0
}
;
int64_t
total_size_
=
0
;
int64_t
total_size_
{
0
}
;
};
};
}
// namespace autotune
}
// namespace autotune
...
...
paddle/phi/kernels/autotune/cache_test.cc
浏览文件 @
b4adbe5c
...
@@ -22,7 +22,7 @@ enum ConvAlgos { GEMMKernel = 0, CuDNNKernel_1 = 1, CuDNNKernel_2 = 2 };
...
@@ -22,7 +22,7 @@ enum ConvAlgos { GEMMKernel = 0, CuDNNKernel_1 = 1, CuDNNKernel_2 = 2 };
TEST
(
AlgosCache
,
AlgosCache
)
{
TEST
(
AlgosCache
,
AlgosCache
)
{
auto
autotune_cache
=
phi
::
autotune
::
AutoTuneCache
::
Instance
();
auto
autotune_cache
=
phi
::
autotune
::
AutoTuneCache
::
Instance
();
auto
&
cache
=
autotune_cache
.
RegisterOrGet
(
"conv_fw"
);
auto
&
cache
=
autotune_cache
.
GetConvForward
(
);
std
::
vector
<
int64_t
>
x_shape
=
{
4
,
224
,
224
,
3
};
std
::
vector
<
int64_t
>
x_shape
=
{
4
,
224
,
224
,
3
};
std
::
vector
<
int64_t
>
w_shape
=
{
32
,
3
,
3
,
3
};
std
::
vector
<
int64_t
>
w_shape
=
{
32
,
3
,
3
,
3
};
...
...
paddle/phi/kernels/autotune/switch_autotune.cc
0 → 100644
浏览文件 @
b4adbe5c
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/autotune/switch_autotune.h"
#include "gflags/gflags.h"
#include "glog/logging.h"
DECLARE_bool
(
use_autotune
);
namespace
phi
{
namespace
autotune
{
void
AutoTuneStatus
::
EnableAutoTune
()
{
FLAGS_use_autotune
=
true
;
Init
();
}
void
AutoTuneStatus
::
DisableAutoTune
()
{
FLAGS_use_autotune
=
false
;
Init
();
}
void
AutoTuneStatus
::
Update
()
{
current_steps_id_
+=
1
;
if
(
!
FLAGS_use_autotune
)
{
return
;
}
// This fuction is called when each iter finished.
if
(
current_steps_id_
+
1
<
start_step_id_
)
{
use_autotune_
=
false
;
}
else
if
(
current_steps_id_
+
1
>=
start_step_id_
&&
current_steps_id_
+
1
<
stop_step_id_
)
{
use_autotune_
=
true
;
AutoTuneCache
::
Instance
().
UpdateStatus
();
step_hit_rates_
.
push_back
(
StepHitRate
());
VLOG
(
3
)
<<
"Step ID: "
<<
current_steps_id_
<<
", Accumulative Cache Hit Rate: "
<<
static_cast
<
int
>
(
AutoTuneCache
::
Instance
().
CacheHitRate
()
*
100
)
<<
"%, Cache Size: "
<<
AutoTuneCache
::
Instance
().
Size
()
<<
", Current Step Hit Rate: "
<<
static_cast
<
int
>
(
StepHitRate
()
*
100
)
<<
"%"
;
}
else
{
use_autotune_
=
false
;
// Set a small tolerance to avoid performance degradation
// due to large cache size under dynamic shape.
// TODO(limingshu): Currently works for conv op only, this
// method shall be opimized when more ops involved in.
// float miss_rate = static_cast<float>(1) - RecentHitRate();
// if (current_steps_id_ == stop_step_id_) {
// AutoTuneCache::Instance().Clean(miss_rate);
// }
if
(
VLOG_IS_ON
(
4
))
{
AutoTuneCache
::
Instance
().
UpdateStatus
();
VLOG
(
4
)
<<
"Step ID: "
<<
current_steps_id_
<<
", Current Step Hit Rate: "
<<
static_cast
<
int
>
(
StepHitRate
()
*
100
)
<<
"%"
;
}
}
}
}
// namespace autotune
}
// namespace phi
paddle/phi/kernels/autotune/switch_autotune.h
浏览文件 @
b4adbe5c
...
@@ -13,10 +13,8 @@
...
@@ -13,10 +13,8 @@
// limitations under the License.
// limitations under the License.
#pragma once
#pragma once
#include <cmath>
#include <cmath>
#include <mutex>
#include <numeric>
#include "glog/logging.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/autotune/cache.h"
namespace
phi
{
namespace
phi
{
...
@@ -31,45 +29,11 @@ class AutoTuneStatus {
...
@@ -31,45 +29,11 @@ class AutoTuneStatus {
bool
UseAutoTune
()
{
return
use_autotune_
;
}
bool
UseAutoTune
()
{
return
use_autotune_
;
}
// EnableAutoTune and DisableAutoTune Should be used for debug only.
// EnableAutoTune and DisableAutoTune should be used for debug only.
void
EnableAutoTune
()
{
void
EnableAutoTune
();
use_autotune_
=
true
;
void
DisableAutoTune
();
Init
();
}
void
DisableAutoTune
()
{
use_autotune_
=
false
;
Init
();
}
void
Update
()
{
void
Update
();
current_steps_id_
+=
1
;
if
(
!
use_autotune_
&&
!
update_use_autotune_
)
{
return
;
}
if
(
current_steps_id_
<
start_step_id_
)
{
use_autotune_
=
false
;
}
else
if
(
current_steps_id_
>=
start_step_id_
&&
current_steps_id_
<
stop_step_id_
)
{
use_autotune_
=
true
;
AutoTuneCache
::
Instance
().
UpdateStatus
();
step_hit_rates_
.
push_back
(
StepHitRate
());
VLOG
(
3
)
<<
"Step ID "
<<
current_steps_id_
<<
", Accumulative Cache Hit Rate: "
<<
AutoTuneCache
::
Instance
().
CacheHitRate
()
<<
", Cache Size: "
<<
AutoTuneCache
::
Instance
().
Size
()
<<
", Current Step Hit Rate: "
<<
StepHitRate
();
}
else
if
(
current_steps_id_
==
stop_step_id_
)
{
use_autotune_
=
false
;
update_use_autotune_
=
false
;
// clean cache according miss rate
float
miss_rate
=
static_cast
<
float
>
(
1
)
-
RecentHitRate
();
AutoTuneCache
::
Instance
().
Clean
(
miss_rate
);
VLOG
(
3
)
<<
"Recent Miss Rate: "
<<
miss_rate
;
}
}
int64_t
StepID
()
{
return
current_steps_id_
;
}
int64_t
StepID
()
{
return
current_steps_id_
;
}
...
@@ -84,19 +48,25 @@ class AutoTuneStatus {
...
@@ -84,19 +48,25 @@ class AutoTuneStatus {
// Hit Rate of Current Step
// Hit Rate of Current Step
float
StepHitRate
()
{
float
StepHitRate
()
{
int64_t
current_hits
=
AutoTuneCache
::
Instance
().
CacheHits
();
static
int64_t
last_step_id
=
-
2
;
int64_t
current_misses
=
AutoTuneCache
::
Instance
().
CacheMisses
();
int64_t
step_hits_
=
current_hits
-
previous_hits_
;
if
(
last_step_id
!=
current_steps_id_
)
{
int64_t
step_misses_
=
current_misses
-
previous_misses_
;
int64_t
current_hits
=
AutoTuneCache
::
Instance
().
CacheHits
();
float
step_hit_rate
=
0.
;
int64_t
current_misses
=
AutoTuneCache
::
Instance
().
CacheMisses
();
int64_t
step_num_accesses
=
step_hits_
+
step_misses_
;
int64_t
step_hits_
=
current_hits
-
previous_hits_
;
if
(
step_num_accesses
!=
0
)
{
int64_t
step_misses_
=
current_misses
-
previous_misses_
;
step_hit_rate
=
static_cast
<
float
>
(
step_hits_
)
/
float
step_hit_rate
=
0.
;
static_cast
<
float
>
(
step_num_accesses
);
int64_t
step_num_accesses
=
step_hits_
+
step_misses_
;
if
(
step_num_accesses
!=
0
)
{
step_hit_rate
=
static_cast
<
float
>
(
step_hits_
)
/
static_cast
<
float
>
(
step_num_accesses
);
}
previous_hits_
=
current_hits
;
previous_misses_
=
current_misses
;
current_step_hit_rate_
=
step_hit_rate
;
last_step_id
=
current_steps_id_
;
}
}
previous_hits_
=
current_hits
;
return
current_step_hit_rate_
;
previous_misses_
=
current_misses
;
return
step_hit_rate
;
}
}
void
SetAutoTuneRange
(
int64_t
start
,
int64_t
stop
)
{
void
SetAutoTuneRange
(
int64_t
start
,
int64_t
stop
)
{
...
@@ -108,21 +78,21 @@ class AutoTuneStatus {
...
@@ -108,21 +78,21 @@ class AutoTuneStatus {
AutoTuneStatus
()
=
default
;
AutoTuneStatus
()
=
default
;
void
Init
()
{
void
Init
()
{
u
pdate_use_autotune_
=
use_autotune_
;
u
se_autotune_
=
false
;
current_steps_id_
=
-
1
;
current_steps_id_
=
-
1
;
previous_hits_
=
0
;
previous_hits_
=
0
;
previous_misses_
=
0
;
previous_misses_
=
0
;
step_hit_rates_
.
clear
();
step_hit_rates_
.
clear
();
AutoTuneCache
::
Instance
().
Clean
(
1.0
);
AutoTuneCache
::
Instance
().
Clean
();
}
}
int64_t
start_step_id_
=
0
;
bool
use_autotune_
{
false
}
;
int64_t
st
op_step_id_
=
10
;
int64_t
st
art_step_id_
{
1
}
;
int64_t
current_steps_id_
=
-
1
;
int64_t
stop_step_id_
{
10
}
;
bool
use_autotune_
=
false
;
int64_t
current_steps_id_
{
-
1
}
;
bool
update_use_autotune_
=
false
;
int64_t
previous_hits_
{
0
}
;
int64_t
previous_
hits_
=
0
;
int64_t
previous_
misses_
{
0
}
;
int64_t
previous_misses_
=
0
;
float
current_step_hit_rate_
{
0.
f
}
;
std
::
vector
<
float
>
step_hit_rates_
;
std
::
vector
<
float
>
step_hit_rates_
;
};
};
...
...
paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu
浏览文件 @
b4adbe5c
...
@@ -289,21 +289,17 @@ void ConvCudnnGradGradKernel(
...
@@ -289,21 +289,17 @@ void ConvCudnnGradGradKernel(
dtype
};
dtype
};
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
miopenConvFwdAlgorithm_t
fwd_algo1
=
static_cast
<
miopenConvFwdAlgorithm_t
>
(
0
);
paddle
::
operators
::
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result1
;
miopenConvFwdAlgorithm_t
fwd_algo2
=
static_cast
<
miopenConvFwdAlgorithm_t
>
(
0
);
paddle
::
operators
::
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result2
;
miopenConvBwdDataAlgorithm_t
data_algo
=
paddle
::
operators
::
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
data_result
;
static_cast
<
miopenConvBwdDataAlgorithm_t
>
(
0
);
paddle
::
operators
::
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
miopenConvBwdWeightsAlgorithm_t
filter_algo
=
filter_result
;
static_cast
<
miopenConvBwdWeightsAlgorithm_t
>
(
0
);
#else
#else
cudnnConvolutionFwdAlgo_t
fwd_algo1
=
paddle
::
operators
::
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result1
;
static_cast
<
cudnnConvolutionFwdAlgo_t
>
(
0
);
paddle
::
operators
::
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result2
;
cudnnConvolutionFwdAlgo_t
fwd_algo2
=
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
data_result
;
static_cast
<
cudnnConvolutionFwdAlgo_t
>
(
0
);
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
cudnnConvolutionBwdDataAlgo_t
data_algo
=
filter_result
;
static_cast
<
cudnnConvolutionBwdDataAlgo_t
>
(
0
);
cudnnConvolutionBwdFilterAlgo_t
filter_algo
=
static_cast
<
cudnnConvolutionBwdFilterAlgo_t
>
(
0
);
#endif
#endif
auto
layout
=
paddle
::
platform
::
GetCudnnTensorFormat
(
auto
layout
=
paddle
::
platform
::
GetCudnnTensorFormat
(
...
@@ -332,13 +328,13 @@ void ConvCudnnGradGradKernel(
...
@@ -332,13 +328,13 @@ void ConvCudnnGradGradKernel(
using
search1
=
using
search1
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
);
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
);
fwd_
algo1
=
search1
::
Find
<
T
>
(
fwd_
result1
.
algo
=
search1
::
Find
<
T
>
(
args1
,
exhaustive_search
,
false
,
workspace_size
,
ctx
);
args1
,
exhaustive_search
,
false
,
workspace_size
,
ctx
);
#else
#else
using
search1
=
using
search1
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
fwd_
algo
1
=
search1
::
Find
<
T
>
(
args1
,
exhaustive_search
,
false
,
ctx
);
fwd_
result
1
=
search1
::
Find
<
T
>
(
args1
,
exhaustive_search
,
false
,
ctx
);
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
,
fwd_
algo1
);
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
,
fwd_
result1
.
algo
);
#endif
#endif
}
}
...
@@ -360,14 +356,14 @@ void ConvCudnnGradGradKernel(
...
@@ -360,14 +356,14 @@ void ConvCudnnGradGradKernel(
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
workspace_size
=
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
));
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
));
fwd_
algo2
=
search2
::
Find
<
T
>
(
fwd_
result2
.
algo
=
search2
::
Find
<
T
>
(
args2
,
exhaustive_search
,
false
,
workspace_size
,
ctx
);
args2
,
exhaustive_search
,
false
,
workspace_size
,
ctx
);
#else
#else
using
search2
=
using
search2
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
fwd_
algo
2
=
search2
::
Find
<
T
>
(
args2
,
exhaustive_search
,
false
,
ctx
);
fwd_
result
2
=
search2
::
Find
<
T
>
(
args2
,
exhaustive_search
,
false
,
ctx
);
workspace_size
=
workspace_size
=
std
::
max
(
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
,
fwd_algo2
));
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
,
fwd_result2
.
algo
));
#endif
#endif
}
}
}
}
...
@@ -389,15 +385,15 @@ void ConvCudnnGradGradKernel(
...
@@ -389,15 +385,15 @@ void ConvCudnnGradGradKernel(
using
search3
=
using
search3
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search3
::
GetWorkspaceSize
(
args3
));
workspace_size
=
std
::
max
(
workspace_size
,
search3
::
GetWorkspaceSize
(
args3
));
filter_algo
=
search3
::
Find
<
T
>
(
filter_
result
.
algo
=
search3
::
Find
<
T
>
(
args3
,
exhaustive_search
,
deterministic
,
workspace_size
,
ctx
);
args3
,
exhaustive_search
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
using
search3
=
using
search3
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
filter_
algo
=
filter_
result
=
search3
::
Find
<
T
>
(
args3
,
exhaustive_search
,
deterministic
,
ctx
);
search3
::
Find
<
T
>
(
args3
,
exhaustive_search
,
deterministic
,
ctx
);
workspace_size
=
workspace_size
=
std
::
max
(
std
::
max
(
workspace_size
,
search3
::
GetWorkspaceSize
(
args3
,
filter_
algo
));
workspace_size
,
search3
::
GetWorkspaceSize
(
args3
,
filter_result
.
algo
));
#endif
#endif
}
}
...
@@ -419,14 +415,15 @@ void ConvCudnnGradGradKernel(
...
@@ -419,14 +415,15 @@ void ConvCudnnGradGradKernel(
using
search4
=
using
search4
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search4
::
GetWorkspaceSize
(
args4
));
workspace_size
=
std
::
max
(
workspace_size
,
search4
::
GetWorkspaceSize
(
args4
));
data_algo
=
search4
::
Find
<
T
>
(
data_
result
.
algo
=
search4
::
Find
<
T
>
(
args4
,
exhaustive_search
,
deterministic
,
workspace_size
,
ctx
);
args4
,
exhaustive_search
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
using
search4
=
using
search4
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
data_algo
=
search4
::
Find
<
T
>
(
args4
,
exhaustive_search
,
deterministic
,
ctx
);
data_result
=
workspace_size
=
search4
::
Find
<
T
>
(
args4
,
exhaustive_search
,
deterministic
,
ctx
);
std
::
max
(
workspace_size
,
search4
::
GetWorkspaceSize
(
args4
,
data_algo
));
workspace_size
=
std
::
max
(
workspace_size
,
search4
::
GetWorkspaceSize
(
args4
,
data_result
.
algo
));
#endif
#endif
}
}
...
@@ -471,7 +468,7 @@ void ConvCudnnGradGradKernel(
...
@@ -471,7 +468,7 @@ void ConvCudnnGradGradKernel(
args1
.
wdesc
.
desc
(),
args1
.
wdesc
.
desc
(),
w
,
w
,
args1
.
cdesc
.
desc
(),
args1
.
cdesc
.
desc
(),
fwd_
algo1
,
fwd_
result1
.
algo
,
&
beta
,
&
beta
,
args1
.
odesc
.
desc
(),
args1
.
odesc
.
desc
(),
transformed_ddy_channel
,
transformed_ddy_channel
,
...
@@ -492,7 +489,7 @@ void ConvCudnnGradGradKernel(
...
@@ -492,7 +489,7 @@ void ConvCudnnGradGradKernel(
args1
.
wdesc
.
desc
(),
args1
.
wdesc
.
desc
(),
w
+
i
*
group_offset_filter
,
w
+
i
*
group_offset_filter
,
args1
.
cdesc
.
desc
(),
args1
.
cdesc
.
desc
(),
fwd_
algo1
,
fwd_
result1
.
algo
,
workspace_ptr
,
workspace_ptr
,
workspace_size
,
workspace_size
,
&
beta
,
&
beta
,
...
@@ -517,7 +514,7 @@ void ConvCudnnGradGradKernel(
...
@@ -517,7 +514,7 @@ void ConvCudnnGradGradKernel(
args2
.
wdesc
.
desc
(),
args2
.
wdesc
.
desc
(),
ddw
,
ddw
,
args2
.
cdesc
.
desc
(),
args2
.
cdesc
.
desc
(),
fwd_
algo2
,
fwd_
result2
.
algo
,
&
beta
,
&
beta
,
args2
.
odesc
.
desc
(),
args2
.
odesc
.
desc
(),
transformed_ddy_channel
,
transformed_ddy_channel
,
...
@@ -538,7 +535,7 @@ void ConvCudnnGradGradKernel(
...
@@ -538,7 +535,7 @@ void ConvCudnnGradGradKernel(
args2
.
wdesc
.
desc
(),
args2
.
wdesc
.
desc
(),
ddw
+
i
*
group_offset_filter
,
ddw
+
i
*
group_offset_filter
,
args2
.
cdesc
.
desc
(),
args2
.
cdesc
.
desc
(),
fwd_
algo2
,
fwd_
result2
.
algo
,
workspace_ptr
,
workspace_ptr
,
workspace_size
,
workspace_size
,
&
alpha
,
&
alpha
,
...
@@ -568,7 +565,7 @@ void ConvCudnnGradGradKernel(
...
@@ -568,7 +565,7 @@ void ConvCudnnGradGradKernel(
args3
.
idesc
.
desc
(),
args3
.
idesc
.
desc
(),
ddx
,
ddx
,
args3
.
cdesc
.
desc
(),
args3
.
cdesc
.
desc
(),
filter_algo
,
filter_
result
.
algo
,
&
beta
,
&
beta
,
args3
.
wdesc
.
desc
(),
args3
.
wdesc
.
desc
(),
dw
,
dw
,
...
@@ -589,7 +586,7 @@ void ConvCudnnGradGradKernel(
...
@@ -589,7 +586,7 @@ void ConvCudnnGradGradKernel(
args3
.
odesc
.
desc
(),
args3
.
odesc
.
desc
(),
transformed_dy_channel
+
i
*
group_offset_out
,
transformed_dy_channel
+
i
*
group_offset_out
,
args3
.
cdesc
.
desc
(),
args3
.
cdesc
.
desc
(),
filter_algo
,
filter_
result
.
algo
,
workspace_ptr
,
workspace_ptr
,
workspace_size
,
workspace_size
,
&
beta
,
&
beta
,
...
@@ -615,7 +612,7 @@ void ConvCudnnGradGradKernel(
...
@@ -615,7 +612,7 @@ void ConvCudnnGradGradKernel(
args4
.
wdesc
.
desc
(),
args4
.
wdesc
.
desc
(),
ddw
,
ddw
,
args4
.
cdesc
.
desc
(),
args4
.
cdesc
.
desc
(),
data_algo
,
data_
result
.
algo
,
&
beta
,
&
beta
,
args4
.
idesc
.
desc
(),
args4
.
idesc
.
desc
(),
transformed_dx
,
transformed_dx
,
...
@@ -636,7 +633,7 @@ void ConvCudnnGradGradKernel(
...
@@ -636,7 +633,7 @@ void ConvCudnnGradGradKernel(
args4
.
odesc
.
desc
(),
args4
.
odesc
.
desc
(),
transformed_dy_channel
+
i
*
group_offset_out
,
transformed_dy_channel
+
i
*
group_offset_out
,
args4
.
cdesc
.
desc
(),
args4
.
cdesc
.
desc
(),
data_algo
,
data_
result
.
algo
,
workspace_ptr
,
workspace_ptr
,
workspace_size
,
workspace_size
,
&
beta
,
&
beta
,
...
...
paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
浏览文件 @
b4adbe5c
...
@@ -322,17 +322,16 @@ void ConvCudnnGradKernel(const Context& ctx,
...
@@ -322,17 +322,16 @@ void ConvCudnnGradKernel(const Context& ctx,
int
group_offset_in
=
i_c
/
groups
*
i_h
*
i_w
*
i_d
;
int
group_offset_in
=
i_c
/
groups
*
i_h
*
i_w
*
i_d
;
int
group_offset_out
=
o_c
/
groups
*
o_h
*
o_w
*
o_d
;
int
group_offset_out
=
o_c
/
groups
*
o_h
*
o_w
*
o_d
;
int
group_offset_filter
=
transformed_filter_channel
.
numel
()
/
groups
;
int
group_offset_filter
=
transformed_filter_channel
.
numel
()
/
groups
;
// ------------------- cudnn backward algorithm ---------------------
// ------------------- cudnn backward algorithm ---------------------
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
miopenConvBwdDataAlgorithm_t
data_algo
=
paddle
::
operators
::
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result
;
static_cast
<
miopenConvBwdDataAlgorithm_t
>
(
0
);
paddle
::
operators
::
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
miopenConvBwdWeightsAlgorithm_t
filter_algo
=
filter_result
;
static_cast
<
miopenConvBwdWeightsAlgorithm_t
>
(
0
);
#else
#else
cudnnConvolutionBwdDataAlgo_t
data_algo
=
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result
;
static_cast
<
cudnnConvolutionBwdDataAlgo_t
>
(
0
);
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
cudnnConvolutionBwdFilterAlgo_t
filter_algo
=
filter_result
;
static_cast
<
cudnnConvolutionBwdFilterAlgo_t
>
(
0
);
#endif
#endif
// input data workspace_size
// input data workspace_size
size_t
workspace_size_d
=
0
;
size_t
workspace_size_d
=
0
;
...
@@ -368,14 +367,14 @@ void ConvCudnnGradKernel(const Context& ctx,
...
@@ -368,14 +367,14 @@ void ConvCudnnGradKernel(const Context& ctx,
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
workspace_size_d
=
workspace_size_d
=
std
::
max
(
workspace_size_d
,
search1
::
GetWorkspaceSize
(
args1
));
std
::
max
(
workspace_size_d
,
search1
::
GetWorkspaceSize
(
args1
));
data_
algo
=
search1
::
Find
<
T
>
(
bwd_result
.
algo
=
search1
::
Find
<
T
>
(
args1
,
exhaustive_search
,
deterministic
,
workspace_size_d
,
ctx
);
args1
,
exhaustive_search
,
deterministic
,
workspace_size_d
,
ctx
);
#else
#else
using
search1
=
using
search1
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
data_algo
=
search1
::
Find
<
T
>
(
args1
,
exhaustive_search
,
deterministic
,
ctx
);
bwd_result
=
search1
::
Find
<
T
>
(
args1
,
exhaustive_search
,
deterministic
,
ctx
);
workspace_size_d
=
workspace_size_d
=
std
::
max
(
std
::
max
(
workspace_size_d
,
search1
::
GetWorkspaceSize
(
args1
,
data_
algo
));
workspace_size_d
,
search1
::
GetWorkspaceSize
(
args1
,
bwd_result
.
algo
));
#endif
#endif
}
}
...
@@ -397,15 +396,17 @@ void ConvCudnnGradKernel(const Context& ctx,
...
@@ -397,15 +396,17 @@ void ConvCudnnGradKernel(const Context& ctx,
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
workspace_size_w
=
workspace_size_w
=
std
::
max
(
workspace_size_w
,
search2
::
GetWorkspaceSize
(
args2
));
std
::
max
(
workspace_size_w
,
search2
::
GetWorkspaceSize
(
args2
));
filter_algo
=
search2
::
Find
<
T
>
(
filter_
result
.
algo
=
search2
::
Find
<
T
>
(
args2
,
exhaustive_search
,
deterministic
,
workspace_size_w
,
ctx
);
args2
,
exhaustive_search
,
deterministic
,
workspace_size_w
,
ctx
);
#else
#else
using
search2
=
using
search2
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
filter_
algo
=
filter_
result
=
search2
::
Find
<
T
>
(
args2
,
exhaustive_search
,
deterministic
,
ctx
);
search2
::
Find
<
T
>
(
args2
,
exhaustive_search
,
deterministic
,
ctx
);
workspace_size_w
=
std
::
max
(
workspace_size_w
,
VLOG
(
3
)
<<
"filter algo: "
<<
filter_result
.
algo
<<
", time "
search2
::
GetWorkspaceSize
(
args2
,
filter_algo
));
<<
filter_result
.
time
;
workspace_size_w
=
std
::
max
(
workspace_size_w
,
search2
::
GetWorkspaceSize
(
args2
,
filter_result
.
algo
));
#endif
#endif
}
}
...
@@ -439,7 +440,7 @@ void ConvCudnnGradKernel(const Context& ctx,
...
@@ -439,7 +440,7 @@ void ConvCudnnGradKernel(const Context& ctx,
args1
.
wdesc
.
desc
(),
args1
.
wdesc
.
desc
(),
filter_data
,
filter_data
,
args1
.
cdesc
.
desc
(),
args1
.
cdesc
.
desc
(),
data_
algo
,
bwd_result
.
algo
,
&
beta
,
&
beta
,
args1
.
idesc
.
desc
(),
args1
.
idesc
.
desc
(),
temp_tensor_data
,
temp_tensor_data
,
...
@@ -471,7 +472,7 @@ void ConvCudnnGradKernel(const Context& ctx,
...
@@ -471,7 +472,7 @@ void ConvCudnnGradKernel(const Context& ctx,
args1
.
wdesc
.
desc
(),
args1
.
wdesc
.
desc
(),
filter_data
,
filter_data
,
args1
.
cdesc
.
desc
(),
args1
.
cdesc
.
desc
(),
data_
algo
,
bwd_result
.
algo
,
&
beta
,
&
beta
,
args1
.
idesc
.
desc
(),
args1
.
idesc
.
desc
(),
transformed_input_grad_data
,
transformed_input_grad_data
,
...
@@ -494,7 +495,7 @@ void ConvCudnnGradKernel(const Context& ctx,
...
@@ -494,7 +495,7 @@ void ConvCudnnGradKernel(const Context& ctx,
args1
.
odesc
.
desc
(),
args1
.
odesc
.
desc
(),
output_grad_data
+
i
*
group_offset_out
,
output_grad_data
+
i
*
group_offset_out
,
args1
.
cdesc
.
desc
(),
args1
.
cdesc
.
desc
(),
data_
algo
,
bwd_result
.
algo
,
cudnn_workspace_ptr
,
cudnn_workspace_ptr
,
workspace_size_d
,
workspace_size_d
,
&
beta
,
&
beta
,
...
@@ -554,7 +555,7 @@ void ConvCudnnGradKernel(const Context& ctx,
...
@@ -554,7 +555,7 @@ void ConvCudnnGradKernel(const Context& ctx,
args2
.
idesc
.
desc
(),
args2
.
idesc
.
desc
(),
input_data
,
input_data
,
args2
.
cdesc
.
desc
(),
args2
.
cdesc
.
desc
(),
filter_algo
,
filter_
result
.
algo
,
&
beta
,
&
beta
,
args2
.
wdesc
.
desc
(),
args2
.
wdesc
.
desc
(),
filter_grad_data
,
filter_grad_data
,
...
@@ -575,7 +576,7 @@ void ConvCudnnGradKernel(const Context& ctx,
...
@@ -575,7 +576,7 @@ void ConvCudnnGradKernel(const Context& ctx,
args2
.
odesc
.
desc
(),
args2
.
odesc
.
desc
(),
output_grad_data
+
i
*
group_offset_out
,
output_grad_data
+
i
*
group_offset_out
,
args2
.
cdesc
.
desc
(),
args2
.
cdesc
.
desc
(),
filter_algo
,
filter_
result
.
algo
,
cudnn_workspace_ptr
,
cudnn_workspace_ptr
,
workspace_size_w
,
workspace_size_w
,
&
beta_filter
,
&
beta_filter
,
...
...
paddle/phi/kernels/gpudnn/conv_kernel.cu
浏览文件 @
b4adbe5c
...
@@ -18,7 +18,6 @@
...
@@ -18,7 +18,6 @@
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/fluid/framework/eigen.h"
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/operators/conv_miopen_helper.h"
#include "paddle/fluid/operators/conv_miopen_helper.h"
#else
#else
...
@@ -68,7 +67,6 @@ void ConvCudnnKernel(const Context& ctx,
...
@@ -68,7 +67,6 @@ void ConvCudnnKernel(const Context& ctx,
"FLAGS_cudnn_deterministic True at same time."
));
"FLAGS_cudnn_deterministic True at same time."
));
const
bool
channel_last
=
(
data_format
==
"NHWC"
||
data_format
==
"NDHWC"
);
const
bool
channel_last
=
(
data_format
==
"NHWC"
||
data_format
==
"NDHWC"
);
auto
dtype
=
paddle
::
platform
::
CudnnDataType
<
T
>::
type
;
auto
dtype
=
paddle
::
platform
::
CudnnDataType
<
T
>::
type
;
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
...
@@ -309,17 +307,17 @@ void ConvCudnnKernel(const Context& ctx,
...
@@ -309,17 +307,17 @@ void ConvCudnnKernel(const Context& ctx,
size_t
workspace_size
=
0
;
// final workspace to allocate.
size_t
workspace_size
=
0
;
// final workspace to allocate.
// ------------------- cudnn conv algorithm ---------------------
// ------------------- cudnn conv algorithm ---------------------
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
miopenConvFwdAlgorithm_t
algo
{}
;
paddle
::
operators
::
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result
;
using
search
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
using
search
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
workspace_size
=
search
::
GetWorkspaceSize
(
args
);
workspace_size
=
search
::
GetWorkspaceSize
(
args
);
algo
=
search
::
Find
<
T
>
(
fwd_result
.
algo
=
search
::
Find
<
T
>
(
args
,
exhaustive_search
,
deterministic
,
workspace_size
,
ctx
);
args
,
exhaustive_search
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
cudnnConvolutionFwdAlgo_t
algo
{}
;
paddle
::
operators
::
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result
;
using
search
=
using
search
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
algo
=
search
::
Find
<
T
>
(
args
,
exhaustive_search
,
deterministic
,
ctx
);
fwd_result
=
search
::
Find
<
T
>
(
args
,
exhaustive_search
,
deterministic
,
ctx
);
workspace_size
=
search
::
GetWorkspaceSize
(
args
,
algo
);
workspace_size
=
search
::
GetWorkspaceSize
(
args
,
fwd_result
.
algo
);
#endif
#endif
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1)
...
@@ -328,7 +326,7 @@ void ConvCudnnKernel(const Context& ctx,
...
@@ -328,7 +326,7 @@ void ConvCudnnKernel(const Context& ctx,
// in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\
// in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\
// FWD_ALGO_IMPLICIT_GEMM manually.
// FWD_ALGO_IMPLICIT_GEMM manually.
if
(
groups
>
1
)
{
if
(
groups
>
1
)
{
algo
=
static_cast
<
cudnnConvolutionFwdAlgo_t
>
(
0
);
fwd_result
.
algo
=
static_cast
<
cudnnConvolutionFwdAlgo_t
>
(
0
);
}
}
#endif
#endif
...
@@ -352,7 +350,7 @@ void ConvCudnnKernel(const Context& ctx,
...
@@ -352,7 +350,7 @@ void ConvCudnnKernel(const Context& ctx,
args
.
wdesc
.
desc
(),
args
.
wdesc
.
desc
(),
filter_data
,
filter_data
,
args
.
cdesc
.
desc
(),
args
.
cdesc
.
desc
(),
algo
,
fwd_result
.
algo
,
&
beta
,
&
beta
,
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
output_data
,
output_data
,
...
@@ -373,7 +371,7 @@ void ConvCudnnKernel(const Context& ctx,
...
@@ -373,7 +371,7 @@ void ConvCudnnKernel(const Context& ctx,
args
.
wdesc
.
desc
(),
args
.
wdesc
.
desc
(),
filter_data
+
i
*
group_offset_filter
,
filter_data
+
i
*
group_offset_filter
,
args
.
cdesc
.
desc
(),
args
.
cdesc
.
desc
(),
algo
,
fwd_result
.
algo
,
workspace_ptr
,
workspace_ptr
,
workspace_size
,
workspace_size
,
&
beta
,
&
beta
,
...
...
paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu
浏览文件 @
b4adbe5c
...
@@ -188,11 +188,13 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
...
@@ -188,11 +188,13 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
dtype
};
dtype
};
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
miopenConvFwdAlgorithm_t
data_algo
{};
paddle
::
operators
::
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result
;
miopenConvBwdWeightsAlgorithm_t
filter_algo
{};
paddle
::
operators
::
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
filter_result
;
#else
#else
cudnnConvolutionFwdAlgo_t
data_algo
{};
paddle
::
operators
::
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result
;
cudnnConvolutionBwdFilterAlgo_t
filter_algo
{};
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
filter_result
;
#endif
#endif
auto
layout_tensor
=
paddle
::
platform
::
GetCudnnTensorFormat
(
layout
);
auto
layout_tensor
=
paddle
::
platform
::
GetCudnnTensorFormat
(
layout
);
...
@@ -218,14 +220,14 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
...
@@ -218,14 +220,14 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
using
search1
=
using
search1
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search1
::
GetWorkspaceSize
(
args1
));
workspace_size
=
std
::
max
(
workspace_size
,
search1
::
GetWorkspaceSize
(
args1
));
data_
algo
=
fwd_result
.
algo
=
search1
::
Find
<
T
>
(
args1
,
false
,
deterministic
,
workspace_size
,
ctx
);
search1
::
Find
<
T
>
(
args1
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
using
search1
=
using
search1
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
data_algo
=
search1
::
Find
<
T
>
(
args1
,
false
,
deterministic
,
ctx
);
fwd_result
=
search1
::
Find
<
T
>
(
args1
,
false
,
deterministic
,
ctx
);
workspace_size
=
workspace_size
=
std
::
max
(
std
::
max
(
workspace_size
,
search1
::
GetWorkspaceSize
(
args1
,
data_
algo
));
workspace_size
,
search1
::
GetWorkspaceSize
(
args1
,
fwd_result
.
algo
));
#endif
#endif
}
}
...
@@ -245,14 +247,14 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
...
@@ -245,14 +247,14 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
using
search2
=
using
search2
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
));
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
));
filter_algo
=
filter_
result
.
algo
=
search2
::
Find
<
T
>
(
args2
,
false
,
deterministic
,
workspace_size
,
ctx
);
search2
::
Find
<
T
>
(
args2
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
using
search2
=
using
search2
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
filter_
algo
=
search2
::
Find
<
T
>
(
args2
,
false
,
deterministic
,
ctx
);
filter_
result
=
search2
::
Find
<
T
>
(
args2
,
false
,
deterministic
,
ctx
);
workspace_size
=
workspace_size
=
std
::
max
(
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
,
filter_
algo
));
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
,
filter_result
.
algo
));
#endif
#endif
}
}
...
@@ -278,7 +280,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
...
@@ -278,7 +280,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
args1
.
wdesc
.
desc
(),
args1
.
wdesc
.
desc
(),
filter_data
+
filter_offset
*
g
,
filter_data
+
filter_offset
*
g
,
args1
.
cdesc
.
desc
(),
args1
.
cdesc
.
desc
(),
data_
algo
,
fwd_result
.
algo
,
&
beta
,
&
beta
,
args1
.
odesc
.
desc
(),
args1
.
odesc
.
desc
(),
dx_data
+
x_offset
*
g
,
dx_data
+
x_offset
*
g
,
...
@@ -295,7 +297,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
...
@@ -295,7 +297,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
args1
.
wdesc
.
desc
(),
args1
.
wdesc
.
desc
(),
filter_data
+
filter_offset
*
g
,
filter_data
+
filter_offset
*
g
,
args1
.
cdesc
.
desc
(),
args1
.
cdesc
.
desc
(),
data_
algo
,
fwd_result
.
algo
,
cudnn_workspace
,
cudnn_workspace
,
workspace_size
,
workspace_size
,
&
beta
,
&
beta
,
...
@@ -338,7 +340,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
...
@@ -338,7 +340,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
args2
.
idesc
.
desc
(),
args2
.
idesc
.
desc
(),
dout_data
+
dout_offset
*
g
,
dout_data
+
dout_offset
*
g
,
args2
.
cdesc
.
desc
(),
args2
.
cdesc
.
desc
(),
filter_algo
,
filter_
result
.
algo
,
&
beta
,
&
beta
,
args2
.
wdesc
.
desc
(),
args2
.
wdesc
.
desc
(),
dfilter_data
+
filter_offset
*
g
,
dfilter_data
+
filter_offset
*
g
,
...
@@ -355,7 +357,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
...
@@ -355,7 +357,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx,
args2
.
odesc
.
desc
(),
args2
.
odesc
.
desc
(),
x_data
+
x_offset
*
g
,
x_data
+
x_offset
*
g
,
args2
.
cdesc
.
desc
(),
args2
.
cdesc
.
desc
(),
filter_algo
,
filter_
result
.
algo
,
cudnn_workspace
,
cudnn_workspace
,
workspace_size
,
workspace_size
,
&
beta
,
&
beta
,
...
@@ -653,22 +655,17 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -653,22 +655,17 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
dilations_
,
dilations_
,
dtype
};
dtype
};
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
miopenConvBwdDataAlgorithm_t
bwd_algo1
=
paddle
::
operators
::
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result1
;
static_cast
<
miopenConvBwdDataAlgorithm_t
>
(
0
);
paddle
::
operators
::
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result2
;
miopenConvBwdDataAlgorithm_t
bwd_algo2
=
paddle
::
operators
::
SearchResult
<
miopenConvBwdWeightsAlgorithm_t
>
static_cast
<
miopenConvBwdDataAlgorithm_t
>
(
0
);
filter_result
;
miopenConvFwdAlgorithm_t
data_algo
=
static_cast
<
miopenConvFwdAlgorithm_t
>
(
0
);
paddle
::
operators
::
SearchResult
<
miopenConvFwdAlgorithm_t
>
fwd_result
;
miopenConvBwdWeightsAlgorithm_t
filter_algo
=
static_cast
<
miopenConvBwdWeightsAlgorithm_t
>
(
0
);
#else
#else
cudnnConvolutionBwdDataAlgo_t
bwd_algo1
=
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result1
;
static_cast
<
cudnnConvolutionBwdDataAlgo_t
>
(
0
);
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result2
;
cudnnConvolutionBwdDataAlgo_t
bwd_algo2
=
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdFilterAlgo_t
>
static_cast
<
cudnnConvolutionBwdDataAlgo_t
>
(
0
);
filter_result
;
cudnnConvolutionFwdAlgo_t
data_algo
=
paddle
::
operators
::
SearchResult
<
cudnnConvolutionFwdAlgo_t
>
fwd_result
;
static_cast
<
cudnnConvolutionFwdAlgo_t
>
(
0
);
cudnnConvolutionBwdFilterAlgo_t
filter_algo
=
static_cast
<
cudnnConvolutionBwdFilterAlgo_t
>
(
0
);
#endif
#endif
auto
layout
=
paddle
::
platform
::
GetCudnnTensorFormat
(
GPUDNNDataLayout
::
kNCHW
);
auto
layout
=
paddle
::
platform
::
GetCudnnTensorFormat
(
GPUDNNDataLayout
::
kNCHW
);
...
@@ -696,13 +693,13 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -696,13 +693,13 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
using
search1
=
using
search1
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
);
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
);
bwd_
algo1
=
bwd_
result1
.
algo
=
search1
::
Find
<
T
>
(
args1
,
false
,
deterministic
,
workspace_size
,
ctx
);
search1
::
Find
<
T
>
(
args1
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
using
search1
=
using
search1
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
bwd_
algo
1
=
search1
::
Find
<
T
>
(
args1
,
false
,
deterministic
,
ctx
);
bwd_
result
1
=
search1
::
Find
<
T
>
(
args1
,
false
,
deterministic
,
ctx
);
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
,
bwd_
algo1
);
workspace_size
=
search1
::
GetWorkspaceSize
(
args1
,
bwd_
result1
.
algo
);
#endif
#endif
ddfilter_
=
ddfilter
.
data
<
T
>
();
ddfilter_
=
ddfilter
.
data
<
T
>
();
...
@@ -720,14 +717,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -720,14 +717,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
using
search2
=
using
search2
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
));
workspace_size
=
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
));
bwd_
algo2
=
bwd_
result2
.
algo
=
search2
::
Find
<
T
>
(
args2
,
false
,
deterministic
,
workspace_size
,
ctx
);
search2
::
Find
<
T
>
(
args2
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
using
search2
=
using
search2
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
bwd_
algo
2
=
search2
::
Find
<
T
>
(
args2
,
false
,
deterministic
,
ctx
);
bwd_
result
2
=
search2
::
Find
<
T
>
(
args2
,
false
,
deterministic
,
ctx
);
workspace_size
=
workspace_size
=
std
::
max
(
std
::
max
(
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
,
bwd_algo2
));
workspace_size
,
search2
::
GetWorkspaceSize
(
args2
,
bwd_result2
.
algo
));
#endif
#endif
}
}
...
@@ -736,9 +733,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -736,9 +733,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args3
.
handle
=
handle
;
args3
.
handle
=
handle
;
args3
.
idesc
.
set
(
transformed_dout
,
iwo_group
);
args3
.
idesc
.
set
(
transformed_dout
,
iwo_group
);
args3
.
wdesc
.
set
(
*
dfilter
,
layout
,
iwo_group
);
args3
.
wdesc
.
set
(
*
dfilter
,
layout
,
iwo_group
);
args3
.
odesc
.
set
(
transformed_ddx_channel
,
iwo_group
);
args3
.
odesc
.
set
(
transformed_ddx_channel
,
iwo_group
);
args3
.
cdesc
.
set
(
dtype
,
args3
.
cdesc
.
set
(
dtype
,
padding_common
,
padding_common
,
strides
,
strides
,
...
@@ -749,14 +744,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -749,14 +744,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
using
search3
=
using
search3
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdWeightsAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search3
::
GetWorkspaceSize
(
args3
));
workspace_size
=
std
::
max
(
workspace_size
,
search3
::
GetWorkspaceSize
(
args3
));
filter_algo
=
filter_
result
.
algo
=
search3
::
Find
<
T
>
(
args3
,
false
,
deterministic
,
workspace_size
,
ctx
);
search3
::
Find
<
T
>
(
args3
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
using
search3
=
using
search3
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdFilterAlgoPerf_t
>
;
filter_
algo
=
search3
::
Find
<
T
>
(
args3
,
false
,
deterministic
,
ctx
);
filter_
result
=
search3
::
Find
<
T
>
(
args3
,
false
,
deterministic
,
ctx
);
workspace_size
=
workspace_size
=
std
::
max
(
std
::
max
(
workspace_size
,
search3
::
GetWorkspaceSize
(
args3
,
filter_
algo
));
workspace_size
,
search3
::
GetWorkspaceSize
(
args3
,
filter_result
.
algo
));
#endif
#endif
}
}
...
@@ -777,14 +772,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -777,14 +772,14 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
using
search4
=
using
search4
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvFwdAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search4
::
GetWorkspaceSize
(
args4
));
workspace_size
=
std
::
max
(
workspace_size
,
search4
::
GetWorkspaceSize
(
args4
));
data_
algo
=
fwd_result
.
algo
=
search4
::
Find
<
T
>
(
args4
,
false
,
deterministic
,
workspace_size
,
ctx
);
search4
::
Find
<
T
>
(
args4
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
using
search4
=
using
search4
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionFwdAlgoPerf_t
>
;
data_algo
=
search4
::
Find
<
T
>
(
args4
,
false
,
deterministic
,
ctx
);
fwd_result
=
search4
::
Find
<
T
>
(
args4
,
false
,
deterministic
,
ctx
);
workspace_size
=
workspace_size
=
std
::
max
(
std
::
max
(
workspace_size
,
search4
::
GetWorkspaceSize
(
args4
,
data_
algo
));
workspace_size
,
search4
::
GetWorkspaceSize
(
args4
,
fwd_result
.
algo
));
#endif
#endif
}
}
...
@@ -831,7 +826,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -831,7 +826,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args1
.
wdesc
.
desc
(),
args1
.
wdesc
.
desc
(),
filter_
+
i
*
group_offset_filter
,
filter_
+
i
*
group_offset_filter
,
args1
.
cdesc
.
desc
(),
args1
.
cdesc
.
desc
(),
bwd_
algo1
,
bwd_
result1
.
algo
,
&
beta
,
&
beta
,
args1
.
idesc
.
desc
(),
args1
.
idesc
.
desc
(),
transformed_ddout_channel_
+
i
*
group_offset_out
,
transformed_ddout_channel_
+
i
*
group_offset_out
,
...
@@ -850,7 +845,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -850,7 +845,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args1
.
odesc
.
desc
(),
args1
.
odesc
.
desc
(),
ddx_
+
i
*
group_offset_in
,
ddx_
+
i
*
group_offset_in
,
args1
.
cdesc
.
desc
(),
args1
.
cdesc
.
desc
(),
bwd_
algo1
,
bwd_
result1
.
algo
,
workspace_ptr
,
workspace_ptr
,
workspace_size
,
workspace_size
,
&
beta
,
&
beta
,
...
@@ -877,7 +872,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -877,7 +872,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args2
.
wdesc
.
desc
(),
args2
.
wdesc
.
desc
(),
ddfilter_
+
i
*
group_offset_filter
,
ddfilter_
+
i
*
group_offset_filter
,
args2
.
cdesc
.
desc
(),
args2
.
cdesc
.
desc
(),
bwd_
algo2
,
bwd_
result2
.
algo
,
&
beta
,
&
beta
,
args2
.
idesc
.
desc
(),
args2
.
idesc
.
desc
(),
conv_x_ddfilter_data
+
i
*
group_offset_out
,
conv_x_ddfilter_data
+
i
*
group_offset_out
,
...
@@ -908,7 +903,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -908,7 +903,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args2
.
odesc
.
desc
(),
args2
.
odesc
.
desc
(),
x_
+
i
*
group_offset_in
,
x_
+
i
*
group_offset_in
,
args2
.
cdesc
.
desc
(),
args2
.
cdesc
.
desc
(),
bwd_
algo2
,
bwd_
result2
.
algo
,
workspace_ptr
,
workspace_ptr
,
workspace_size
,
workspace_size
,
&
alpha
,
&
alpha
,
...
@@ -964,7 +959,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -964,7 +959,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args3
.
idesc
.
desc
(),
args3
.
idesc
.
desc
(),
transformed_dout_channel_
+
i
*
group_offset_out
,
transformed_dout_channel_
+
i
*
group_offset_out
,
args3
.
cdesc
.
desc
(),
args3
.
cdesc
.
desc
(),
filter_algo
,
filter_
result
.
algo
,
&
beta
,
&
beta
,
args3
.
wdesc
.
desc
(),
args3
.
wdesc
.
desc
(),
dfilter_
+
i
*
group_offset_filter
,
dfilter_
+
i
*
group_offset_filter
,
...
@@ -983,7 +978,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -983,7 +978,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args3
.
odesc
.
desc
(),
args3
.
odesc
.
desc
(),
ddx_
+
i
*
group_offset_in
,
ddx_
+
i
*
group_offset_in
,
args3
.
cdesc
.
desc
(),
args3
.
cdesc
.
desc
(),
filter_algo
,
filter_
result
.
algo
,
workspace_ptr
,
workspace_ptr
,
workspace_size
,
workspace_size
,
&
beta
,
&
beta
,
...
@@ -1009,7 +1004,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -1009,7 +1004,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args4
.
wdesc
.
desc
(),
args4
.
wdesc
.
desc
(),
ddfilter_
+
i
*
group_offset_filter
,
ddfilter_
+
i
*
group_offset_filter
,
args4
.
cdesc
.
desc
(),
args4
.
cdesc
.
desc
(),
data_
algo
,
fwd_result
.
algo
,
&
beta
,
&
beta
,
args4
.
odesc
.
desc
(),
args4
.
odesc
.
desc
(),
transformed_dx_
+
i
*
group_offset_in
,
transformed_dx_
+
i
*
group_offset_in
,
...
@@ -1028,7 +1023,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
...
@@ -1028,7 +1023,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel(
args4
.
wdesc
.
desc
(),
args4
.
wdesc
.
desc
(),
ddfilter_
+
i
*
group_offset_filter
,
ddfilter_
+
i
*
group_offset_filter
,
args4
.
cdesc
.
desc
(),
args4
.
cdesc
.
desc
(),
data_
algo
,
fwd_result
.
algo
,
workspace_ptr
,
workspace_ptr
,
workspace_size
,
workspace_size
,
&
beta
,
&
beta
,
...
...
paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu
浏览文件 @
b4adbe5c
...
@@ -217,16 +217,19 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
...
@@ -217,16 +217,19 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
c_groups
);
c_groups
);
#ifdef PADDLE_WITH_HIP
#ifdef PADDLE_WITH_HIP
paddle
::
operators
::
SearchResult
<
miopenConvBwdDataAlgorithm_t
>
bwd_result
;
using
search
=
using
search
=
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
miopenConvBwdDataAlgorithm_t
>
;
workspace_size
=
std
::
max
(
workspace_size
,
search
::
GetWorkspaceSize
(
args
));
workspace_size
=
std
::
max
(
workspace_size
,
search
::
GetWorkspaceSize
(
args
));
algo
=
search
::
Find
<
T
>
(
args
,
false
,
deterministic
,
workspace_size
,
ctx
);
bwd_result
.
algo
=
search
::
Find
<
T
>
(
args
,
false
,
deterministic
,
workspace_size
,
ctx
);
#else
#else
paddle
::
operators
::
SearchResult
<
cudnnConvolutionBwdDataAlgo_t
>
bwd_result
;
using
search
=
using
search
=
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
paddle
::
operators
::
SearchAlgorithm
<
cudnnConvolutionBwdDataAlgoPerf_t
>
;
algo
=
search
::
Find
<
T
>
(
args
,
false
,
deterministic
,
ctx
);
bwd_result
=
search
::
Find
<
T
>
(
args
,
false
,
deterministic
,
ctx
);
workspace_size
=
workspace_size
=
std
::
max
(
workspace_size
,
search
::
GetWorkspaceSize
(
args
,
algo
));
std
::
max
(
workspace_size
,
search
::
GetWorkspaceSize
(
args
,
bwd_result
.
algo
));
#endif
#endif
// ------------------- cudnn conv transpose forward ---------------------
// ------------------- cudnn conv transpose forward ---------------------
...
@@ -247,7 +250,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
...
@@ -247,7 +250,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
args
.
wdesc
.
desc
(),
args
.
wdesc
.
desc
(),
filter_data
+
filter_offset
*
g
,
filter_data
+
filter_offset
*
g
,
args
.
cdesc
.
desc
(),
args
.
cdesc
.
desc
(),
algo
,
bwd_result
.
algo
,
&
beta
,
&
beta
,
args
.
idesc
.
desc
(),
args
.
idesc
.
desc
(),
transformed_out_data
+
out_offset
*
g
,
transformed_out_data
+
out_offset
*
g
,
...
@@ -264,7 +267,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
...
@@ -264,7 +267,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx,
args
.
odesc
.
desc
(),
args
.
odesc
.
desc
(),
x_data
+
x_offset
*
g
,
x_data
+
x_offset
*
g
,
args
.
cdesc
.
desc
(),
args
.
cdesc
.
desc
(),
algo
,
bwd_result
.
algo
,
cudnn_workspace
,
cudnn_workspace
,
workspace_size
,
workspace_size
,
&
beta
,
&
beta
,
...
...
paddle/phi/kernels/impl/conv_cudnn_impl.h
浏览文件 @
b4adbe5c
...
@@ -36,7 +36,7 @@
...
@@ -36,7 +36,7 @@
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
DECLARE_bool
(
cudnn_deterministic
);
DECLARE_bool
(
cudnn_deterministic
);
DECLARE_
u
int64
(
conv_workspace_size_limit
);
DECLARE_int64
(
conv_workspace_size_limit
);
DECLARE_bool
(
cudnn_exhaustive_search
);
DECLARE_bool
(
cudnn_exhaustive_search
);
namespace
phi
{
namespace
phi
{
...
...
python/paddle/fluid/tests/unittests/test_switch_autotune.py
浏览文件 @
b4adbe5c
...
@@ -14,7 +14,7 @@
...
@@ -14,7 +14,7 @@
import
paddle
import
paddle
import
unittest
import
unittest
import
numpy
import
numpy
as
np
class
SimpleNet
(
paddle
.
nn
.
Layer
):
class
SimpleNet
(
paddle
.
nn
.
Layer
):
...
@@ -27,6 +27,7 @@ class SimpleNet(paddle.nn.Layer):
...
@@ -27,6 +27,7 @@ class SimpleNet(paddle.nn.Layer):
def
train_dygraph
(
net
,
data
):
def
train_dygraph
(
net
,
data
):
data
.
stop_gradient
=
False
out
=
net
(
data
)
out
=
net
(
data
)
loss
=
paddle
.
mean
(
out
)
loss
=
paddle
.
mean
(
out
)
adam
=
paddle
.
optimizer
.
Adam
(
parameters
=
net
.
parameters
())
adam
=
paddle
.
optimizer
.
Adam
(
parameters
=
net
.
parameters
())
...
@@ -36,6 +37,7 @@ def train_dygraph(net, data):
...
@@ -36,6 +37,7 @@ def train_dygraph(net, data):
def
static_program
(
net
,
data
):
def
static_program
(
net
,
data
):
data
.
stop_gradient
=
False
out
=
net
(
data
)
out
=
net
(
data
)
loss
=
paddle
.
mean
(
out
)
loss
=
paddle
.
mean
(
out
)
adam
=
paddle
.
optimizer
.
Adam
()
adam
=
paddle
.
optimizer
.
Adam
()
...
@@ -44,48 +46,63 @@ def static_program(net, data):
...
@@ -44,48 +46,63 @@ def static_program(net, data):
class
TestAutoTune
(
unittest
.
TestCase
):
class
TestAutoTune
(
unittest
.
TestCase
):
def
set_flags
(
self
,
enable_autotune
):
if
paddle
.
is_compiled_with_cuda
():
if
enable_autotune
:
paddle
.
set_flags
({
'FLAGS_conv_workspace_size_limit'
:
-
1
})
else
:
paddle
.
set_flags
({
'FLAGS_conv_workspace_size_limit'
:
512
})
def
get_flags
(
self
,
name
):
res
=
paddle
.
get_flags
(
name
)
return
res
[
name
]
def
get_expected_res
(
self
,
step_id
,
enable_autotune
):
expected_res
=
{
"step_id"
:
step_id
,
"cache_size"
:
0
,
"cache_hit_rate"
:
0
}
if
paddle
.
is_compiled_with_cuda
():
# Total 3 * num_iters cache accesses, only iter 2 hits the cache.
if
enable_autotune
and
step_id
>=
1
:
expected_res
[
"cache_size"
]
=
3
if
enable_autotune
and
step_id
==
2
:
expected_res
[
"cache_hit_rate"
]
=
np
.
round
(
float
(
3
)
/
float
(
9
),
5
)
return
expected_res
def
test_autotune
(
self
):
def
test_autotune
(
self
):
paddle
.
fluid
.
core
.
disable_autotune
()
paddle
.
fluid
.
core
.
disable_autotune
()
status
=
paddle
.
fluid
.
core
.
autotune_status
()
self
.
assertEqual
(
self
.
get_flags
(
"FLAGS_use_autotune"
),
False
)
self
.
assertEqual
(
status
[
"use_autotune"
],
False
)
paddle
.
fluid
.
core
.
enable_autotune
()
paddle
.
fluid
.
core
.
enable_autotune
()
status
=
paddle
.
fluid
.
core
.
autotune_status
()
self
.
assertEqual
(
self
.
get_flags
(
"FLAGS_use_autotune"
),
True
)
self
.
assertEqual
(
status
[
"use_autotune"
],
True
)
def
check_status
(
self
,
expected_res
):
def
check_status
(
self
,
expected_res
):
status
=
paddle
.
fluid
.
core
.
autotune_status
()
status
=
paddle
.
fluid
.
core
.
autotune_status
()
for
key
in
status
.
keys
():
for
key
in
status
.
keys
():
self
.
assertEqual
(
status
[
key
],
expected_res
[
key
])
if
key
==
"cache_hit_rate"
:
v
=
np
.
round
(
status
[
key
],
5
)
else
:
v
=
status
[
key
]
self
.
assertEqual
(
v
,
expected_res
[
key
])
class
TestDygraphAutoTuneStatus
(
TestAutoTune
):
class
TestDygraphAutoTuneStatus
(
TestAutoTune
):
def
run_program
(
self
,
enable_autotune
):
def
run_program
(
self
,
enable_autotune
):
self
.
set_flags
(
enable_autotune
)
if
enable_autotune
:
if
enable_autotune
:
paddle
.
fluid
.
core
.
enable_autotune
()
paddle
.
fluid
.
core
.
enable_autotune
()
else
:
else
:
paddle
.
fluid
.
core
.
disable_autotune
()
paddle
.
fluid
.
core
.
disable_autotune
()
paddle
.
fluid
.
core
.
autotune_range
(
1
,
2
)
paddle
.
fluid
.
core
.
set_
autotune_range
(
1
,
2
)
x_var
=
paddle
.
uniform
((
1
,
1
,
8
,
8
),
dtype
=
'float32'
,
min
=-
1.
,
max
=
1.
)
x_var
=
paddle
.
uniform
((
1
,
1
,
8
,
8
),
dtype
=
'float32'
,
min
=-
1.
,
max
=
1.
)
net
=
SimpleNet
()
net
=
SimpleNet
()
for
i
in
range
(
3
):
for
i
in
range
(
3
):
train_dygraph
(
net
,
x_var
)
train_dygraph
(
net
,
x_var
)
if
i
>=
1
and
i
<
2
:
expected_res
=
self
.
get_expected_res
(
i
,
enable_autotune
)
expected_res
=
{
self
.
check_status
(
expected_res
)
"step_id"
:
i
,
"use_autotune"
:
enable_autotune
,
"cache_size"
:
0
,
"cache_hit_rate"
:
0
}
self
.
check_status
(
expected_res
)
else
:
expected_res
=
{
"step_id"
:
i
,
"use_autotune"
:
False
,
"cache_size"
:
0
,
"cache_hit_rate"
:
0
}
self
.
check_status
(
expected_res
)
def
func_enable_autotune
(
self
):
def
func_enable_autotune
(
self
):
self
.
run_program
(
enable_autotune
=
True
)
self
.
run_program
(
enable_autotune
=
True
)
...
@@ -107,59 +124,45 @@ class TestDygraphAutoTuneStatus(TestAutoTune):
...
@@ -107,59 +124,45 @@ class TestDygraphAutoTuneStatus(TestAutoTune):
class
TestStaticAutoTuneStatus
(
TestAutoTune
):
class
TestStaticAutoTuneStatus
(
TestAutoTune
):
def
run_program
(
self
,
enable_autotune
):
def
run_program
(
self
,
enable_autotune
):
paddle
.
enable_static
()
paddle
.
enable_static
()
if
enable_autotune
:
paddle
.
fluid
.
core
.
enable_autotune
()
else
:
paddle
.
fluid
.
core
.
disable_autotune
()
paddle
.
fluid
.
core
.
autotune_range
(
1
,
2
)
data_shape
=
[
1
,
1
,
8
,
8
]
data_shape
=
[
1
,
1
,
8
,
8
]
data
=
paddle
.
static
.
data
(
name
=
'X'
,
shape
=
data_shape
,
dtype
=
'float32'
)
main_program
=
paddle
.
static
.
Program
()
net
=
SimpleNet
()
startup_program
=
paddle
.
static
.
Program
()
loss
=
static_program
(
net
,
data
)
with
paddle
.
static
.
program_guard
(
main_program
,
startup_program
):
data
=
paddle
.
static
.
data
(
name
=
'X'
,
shape
=
data_shape
,
dtype
=
'float32'
)
net
=
SimpleNet
()
loss
=
static_program
(
net
,
data
)
place
=
paddle
.
CUDAPlace
(
0
)
if
paddle
.
fluid
.
core
.
is_compiled_with_cuda
(
place
=
paddle
.
CUDAPlace
(
0
)
if
paddle
.
fluid
.
core
.
is_compiled_with_cuda
(
)
else
paddle
.
CPUPlace
()
)
else
paddle
.
CPUPlace
()
exe
=
paddle
.
static
.
Executor
(
place
)
exe
=
paddle
.
static
.
Executor
(
place
)
exe
.
run
(
paddle
.
static
.
default_startup_program
())
exe
.
run
(
startup_program
)
x
=
numpy
.
random
.
random
(
size
=
data_shape
).
astype
(
'float32'
)
x
=
np
.
random
.
random
(
size
=
data_shape
).
astype
(
'float32'
)
self
.
set_flags
(
enable_autotune
)
if
enable_autotune
:
paddle
.
fluid
.
core
.
enable_autotune
()
else
:
paddle
.
fluid
.
core
.
disable_autotune
()
paddle
.
fluid
.
core
.
set_autotune_range
(
1
,
2
)
for
i
in
range
(
3
):
for
i
in
range
(
3
):
exe
.
run
(
feed
=
{
'X'
:
x
},
fetch_list
=
[
loss
])
exe
.
run
(
program
=
main_program
,
feed
=
{
'X'
:
x
},
fetch_list
=
[
loss
])
status
=
paddle
.
fluid
.
core
.
autotune_status
()
status
=
paddle
.
fluid
.
core
.
autotune_status
()
# In static mode, the startup_program will run at first.
expected_res
=
self
.
get_expected_res
(
i
,
enable_autotune
)
# The expected step_id will be increased by 1.
self
.
check_status
(
expected_res
)
if
i
>=
0
and
i
<
1
:
expected_res
=
{
"step_id"
:
i
+
1
,
"use_autotune"
:
enable_autotune
,
"cache_size"
:
0
,
"cache_hit_rate"
:
0
}
self
.
check_status
(
expected_res
)
else
:
expected_res
=
{
"step_id"
:
i
+
1
,
"use_autotune"
:
False
,
"cache_size"
:
0
,
"cache_hit_rate"
:
0
}
self
.
check_status
(
expected_res
)
paddle
.
disable_static
()
paddle
.
disable_static
()
def
func_enable_autotune
(
self
):
def
func_enable_autotune
(
self
):
self
.
run_program
(
enable_autotune
=
True
)
self
.
run_program
(
enable_autotune
=
True
)
def
test_enable_autotune
(
self
):
def
test_enable_autotune
(
self
):
with
paddle
.
fluid
.
framework
.
_test_eager_guard
():
self
.
func_enable_autotune
()
self
.
func_enable_autotune
()
self
.
func_enable_autotune
()
def
func_disable_autotune
(
self
):
def
func_disable_autotune
(
self
):
self
.
run_program
(
enable_autotune
=
False
)
self
.
run_program
(
enable_autotune
=
False
)
def
test_disable_autotune
(
self
):
def
test_disable_autotune
(
self
):
with
paddle
.
fluid
.
framework
.
_test_eager_guard
():
self
.
func_disable_autotune
()
self
.
func_disable_autotune
()
self
.
func_disable_autotune
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录