PaddlePaddle / Paddle, commit 767050d9 (unverified)
Authored Sep 29, 2021 by Yiqun Liu; committed via GitHub on Sep 29, 2021.
Implement the grad and enhance the cache of norm_convolution fusion ops. (#36168)
Parent: b3d2dc7b
Showing 4 changed files with 630 additions and 253 deletions (+630 -253):

  paddle/fluid/framework/operator_kernel_configs.h      +2    -0
  paddle/fluid/operators/fused/cudnn_fusion_helper.h    +34   -31
  paddle/fluid/operators/fused/cudnn_norm_conv.cu.h     +276  -81
  paddle/fluid/operators/fused/cudnn_norm_conv_test.cc  +318  -141
paddle/fluid/framework/operator_kernel_configs.h (view file @ 767050d9)
@@ -15,8 +15,10 @@ limitations under the License. */
 
 #pragma once
 
 #include <algorithm>
+#include <mutex>
 #include <unordered_map>
 #include <vector>
+#include "glog/logging.h"
 
 namespace paddle {
 namespace framework {
paddle/fluid/operators/fused/cudnn_fusion_helper.h (view file @ 767050d9)
@@ -14,10 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include <string>
 #include <vector>
-#include "paddle/fluid/platform/cudnn_desc.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
+#include "paddle/fluid/framework/operator_kernel_configs.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/enforce.h"
@@ -41,12 +39,9 @@ class CudnnFusionOp {
   }
 
   ~CudnnFusionOp() {
-    // New 'fused op' descriptor destruction
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_));
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_));
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFusedOpsPlan(op_));
+    dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_);
+    dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_);
+    dynload::cudnnDestroyFusedOpsPlan(op_);
   }
 
   // Execute fused op
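The status checks are dropped here, presumably because PADDLE_ENFORCE_CUDA_SUCCESS throws on failure and a C++ destructor must not let an exception escape: if the destructor runs while another exception is already unwinding the stack, a second throw calls std::terminate(). A standalone illustration of that language rule (illustrative only, not Paddle code):

#include <cstdio>
#include <stdexcept>

struct Cleanup {
  // noexcept(false) only so the example compiles; destructors are
  // implicitly noexcept in C++11 and later.
  ~Cleanup() noexcept(false) {
    // Throwing while another exception is already unwinding the stack
    // calls std::terminate().
    throw std::runtime_error("cleanup failed");
  }
};

int main() {
  try {
    Cleanup c;
    throw std::runtime_error("primary error");  // unwinding destroys `c`
  } catch (const std::exception &e) {
    std::printf("%s\n", e.what());  // never reached: terminate() fires first
  }
  return 0;
}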
@@ -121,41 +116,49 @@ class CudnnFusionOp {
 
   // Get the workspace, which is required before Execute().
   size_t GetWorkspaceSizeInBytes(cudnnHandle_t cudnn_handle) {
-    size_t workspace_bytes = 0U;
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan(
-        cudnn_handle, op_, op_const_params_, &workspace_bytes));
-    plan_created_ = true;
-    return workspace_bytes;
+    if (!plan_created_) {
+      workspace_bytes_ = 0U;
+      PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan(
+          cudnn_handle, op_, op_const_params_, &workspace_bytes_));
+      plan_created_ = true;
+    }
+    return workspace_bytes_;
   }
 
  private:
   bool plan_created_;
+  size_t workspace_bytes_;
 
   cudnnFusedOpsPlan_t op_;
   cudnnFusedOpsConstParamPack_t op_const_params_;
   cudnnFusedOpsVariantParamPack_t op_variant_params_;
 };
 
-static inline std::vector<int> GetStrides(const std::vector<int> &shape) {
-  if (shape.size() < 1) {
-    return {};
-  }
-  int dim = static_cast<int>(shape.size());
-  std::vector<int> pro_shape(shape);
-  std::vector<int> strides(dim);
-  int temp = pro_shape[1];
-  pro_shape.erase(pro_shape.begin() + 1);
-  pro_shape.push_back(temp);
-  strides.back() = 1;
-  for (int i = dim - 2; i >= 0; --i) {
-    strides[i] = strides[i + 1] * pro_shape[i + 1];
-  }
-  strides.pop_back();
-  strides.insert(strides.begin() + 1, 1);
-  return strides;
-}
-
-static inline int64_t AlignUp(int64_t a, int64_t b) { return (a + b - 1) / b; }
+class CudnnFusionOpCache {
+ public:
+  static CudnnFusionOpCache &Instance() {
+    static CudnnFusionOpCache instance;
+    return instance;
+  }
+
+  framework::AlgorithmsCache<CudnnFusionOp *> *GetForward() {
+    return &forward_cache_;
+  }
+  framework::AlgorithmsCache<CudnnFusionOp *> *GetBackward() {
+    return &backward_cache_;
+  }
+
+ private:
+  CudnnFusionOpCache() {}
+  ~CudnnFusionOpCache() {
+    // Need to delete the memory of cache.
+  }
+  CudnnFusionOpCache(const CudnnFusionOpCache &) {}
+
+ private:
+  framework::AlgorithmsCache<CudnnFusionOp *> forward_cache_;
+  framework::AlgorithmsCache<CudnnFusionOp *> backward_cache_;
+};
 
 #endif  // CUDNN_VERSION >= 8000
 }  // namespace operators
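CudnnFusionOpCache is a Meyers-style singleton holding one framework::AlgorithmsCache<CudnnFusionOp *> per direction, so a fused-op plan is built once per unique convolution configuration and reused across kernel invocations. A minimal standalone sketch of the same get-or-create pattern (hypothetical PlanCache and key types, not Paddle's actual AlgorithmsCache):

#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <vector>

// Stand-in for an expensive-to-build cuDNN fused-ops plan.
struct Plan {
  size_t workspace_bytes = 0;
};

// Process-wide singleton that memoizes plan construction behind a key
// derived from the convolution configuration.
class PlanCache {
 public:
  static PlanCache &Instance() {
    static PlanCache instance;  // Meyers singleton: thread-safe init in C++11
    return instance;
  }

  // Returns the cached plan for `key`, invoking `create` only on a miss.
  Plan *GetOrCreate(const std::vector<int> &key,
                    const std::function<std::unique_ptr<Plan>()> &create) {
    std::lock_guard<std::mutex> guard(mutex_);
    auto it = cache_.find(key);
    if (it == cache_.end()) {
      it = cache_.emplace(key, create()).first;
    }
    return it->second.get();
  }

 private:
  PlanCache() = default;
  PlanCache(const PlanCache &) = delete;

  std::mutex mutex_;
  std::map<std::vector<int>, std::unique_ptr<Plan>> cache_;
};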
paddle/fluid/operators/fused/cudnn_norm_conv.cu.h (view file @ 767050d9)
@@ -15,125 +15,320 @@ limitations under the License. */
 
 #pragma once
 
 #include "paddle/fluid/operators/fused/cudnn_fusion_helper.h"
+#include "paddle/fluid/platform/cudnn_desc.h"
+#include "paddle/fluid/platform/cudnn_helper.h"
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
 namespace dynload = platform::dynload;
+template <typename T>
+using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
 
 #if CUDNN_VERSION >= 8000
+
+static size_t RoundUp(int64_t a, int64_t b) { return (a + b - 1) / b * b; }
+
 template <typename T>
-class CudnnNormConvolutionOp {
- public:
-  CudnnNormConvolutionOp()
-      : fwd_op_(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS) {}
-  ~CudnnNormConvolutionOp() {}
-
-  void Init(const platform::CUDADeviceContext &ctx,
-            const std::vector<int> &input_shape,
-            const std::vector<int> &filter_shape,
-            const std::vector<int> &output_shape, const int &pad,
-            const int &stride, const int &dilate, const int &group) {
-    cudnn_fwd_compute_type_ = platform::CudnnDataType<float>::type;
-    dtype_ = platform::CudnnDataType<T>::type;
-    format_ = CUDNN_TENSOR_NHWC;
-
-    InitDescriptors(ctx, input_shape, filter_shape, output_shape, pad, stride,
-                    dilate, group);
-    GetWorkspaceSize(ctx);
-  }
-
-  void Forward(const platform::CUDADeviceContext &ctx, T *input_ptr,
-               T *filter_ptr, T *output_ptr, float *sum_ptr,
-               float *sum_of_squares_ptr) {
-    auto handle = ctx.cudnn_handle();
-    auto workspace_handle = ctx.cudnn_workspace_handle();
-    // Set variant_param
-    // input ptr
-    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr);
-    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr);
-    fwd_op_.SetOpVariantParamAttrPtr(
-        CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &fwd_workspace_byte_);
-    // output ptr
-    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr);
-    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr);
-    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr);
-    workspace_handle.RunFunc(
-        [&](void *workspace_ptr) {
-          // workspace ptr
-          fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr);
-          // fused op execute
-          fwd_op_.Execute(handle);
-        },
-        fwd_workspace_byte_);
-  }
-
-  // TBD
-  void Backward(const platform::CUDADeviceContext &ctx) {}
-
- private:
-  void InitDescriptors(const platform::CUDADeviceContext &ctx,
-                       const std::vector<int> &input_shape,
-                       const std::vector<int> &filter_shape,
-                       const std::vector<int> &output_shape, const int &pad,
-                       const int &stride, const int &dilate,
-                       const int &group) {
-    // Set constant_param
-    fwd_op_.SetOpConstParamAttr(
-        {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_WDATA_PLACEHOLDER,
-         CUDNN_PARAM_YDATA_PLACEHOLDER},
-        CUDNN_PTR_16B_ALIGNED);
-    fwd_op_.SetOpConstParamAttr(
-        {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER},
-        CUDNN_PTR_16B_ALIGNED);
-
-    std::vector<int> pad_vec = {pad, pad};
-    std::vector<int> stride_vec = {stride, stride};
-    std::vector<int> dilate_vec = {dilate, dilate};
-    int output_channel = filter_shape[0];
-    std::vector<int> stats_shape = {1, 1, 1, output_channel};
-
-    // set conv desc
-    conv_desc_.set(dtype_, pad_vec, stride_vec, dilate_vec, false, group);
-    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC, conv_desc_.desc());
-
-    // set input desc
-    in_desc_.set(input_shape, format_, dtype_);
-    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, in_desc_.desc());
-
-    // set filter desc
-    filter_desc_.set(filter_shape, format_, dtype_, group);
-    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_WDESC, filter_desc_.desc());
-
-    // set output desc
-    out_desc_.set(output_shape, format_, dtype_);
-    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YDESC, out_desc_.desc());
-
-    // set output_stats desc
-    out_stats_desc_.set(stats_shape, format_, cudnn_fwd_compute_type_);
-    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YSTATS_DESC,
-                                out_stats_desc_.desc());
-
-    fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, CUDNN_BATCHNORM_SPATIAL);
-  }
-
-  void GetWorkspaceSize(const platform::CUDADeviceContext &ctx) {
-    auto handle = ctx.cudnn_handle();
-    fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(handle);
-  }
-
-  size_t fwd_workspace_byte_ = 0;
-  cudnnDataType_t dtype_;
-  cudnnDataType_t cudnn_fwd_compute_type_;
-  platform::TensorDescriptor in_desc_;
-  platform::FilterDescriptor filter_desc_;
-  platform::TensorDescriptor out_desc_;
-  platform::TensorDescriptor out_stats_desc_;
-  platform::ConvolutionDescriptor conv_desc_;
-  cudnnTensorFormat_t format_;
-  CudnnFusionOp fwd_op_;
-};
+struct NormConvolutionArgs {
+  NormConvolutionArgs() {
+    dtype = platform::CudnnDataType<T>::type;
+    format = CUDNN_TENSOR_NHWC;
+    compute_type = platform::CudnnDataType<float>::type;
+  }
+
+  void Set(const std::vector<int> &input_shape,
+           const std::vector<int> &filter_shape,
+           const std::vector<int> &output_shape, int padding, int stride,
+           int dilation, int group) {
+    PADDLE_ENFORCE_EQ(
+        input_shape.size(), 4U,
+        platform::errors::InvalidArgument(
+            "The size of input_shape is expected to 4. But recieved "
+            "input_shape's size is %d, input_shape is [%s].",
+            input_shape.size(), framework::make_ddim(input_shape)));
+    PADDLE_ENFORCE_EQ(
+        filter_shape.size(), 4U,
+        platform::errors::InvalidArgument(
+            "The size of filter_shape is expected to 4. But recieved "
+            "filter_shape's size is %d, filter_shape is [%s].",
+            filter_shape.size(), framework::make_ddim(filter_shape)));
+    PADDLE_ENFORCE_EQ(filter_shape[1] == filter_shape[2] &&
+                          (filter_shape[1] == 1 || filter_shape[1] == 3),
+                      true,
+                      platform::errors::InvalidArgument(
+                          "The filter_shape is expected to store as nhwc, and "
+                          "h = w = 1 or 3. But recieved filter_shape is [%s].",
+                          framework::make_ddim(filter_shape)));
+    PADDLE_ENFORCE_EQ(
+        output_shape.size(), 4U,
+        platform::errors::InvalidArgument(
+            "The size of output_shape is expected to 4. But recieved "
+            "filter_shape's size is %d, filter_shape is [%s].",
+            output_shape.size(), framework::make_ddim(output_shape)));
+
+    for (size_t i = 0; i < input_shape.size(); ++i) {
+      in_dims.push_back(input_shape[i]);
+    }
+    for (size_t i = 0; i < filter_shape.size(); ++i) {
+      filter_dims.push_back(filter_shape[i]);
+    }
+    paddings = {padding, padding};
+    strides = {stride, stride};
+    dilations = {dilation, dilation};
+
+    in_desc.set(input_shape, format, dtype);
+    filter_desc.set(filter_shape, format, dtype, group);
+    out_desc.set(output_shape, format, dtype);
+
+    int output_channel = filter_shape[0];
+    std::vector<int> stats_shape = {1, 1, 1, output_channel};
+    out_stats_desc.set(stats_shape, format, compute_type);
+
+    conv_desc.set(dtype, paddings, strides, dilations, false, group);
+  }
+
+  cudnnDataType_t dtype;
+  cudnnTensorFormat_t format;
+  cudnnDataType_t compute_type;
+
+  std::vector<int64_t> in_dims;
+  std::vector<int64_t> filter_dims;
+  std::vector<int> strides;
+  std::vector<int> paddings;
+  std::vector<int> dilations;
+
+  platform::TensorDescriptor in_desc;
+  platform::FilterDescriptor filter_desc;
+  platform::TensorDescriptor out_desc;
+  platform::TensorDescriptor out_stats_desc;
+  platform::ConvolutionDescriptor conv_desc;
+};
+
+template <typename T>
+class CudnnNormConvolution {
+ public:
+  CudnnNormConvolution(const platform::CUDADeviceContext &ctx,
+                       const std::vector<int> &input_shape,
+                       const std::vector<int> &filter_shape,
+                       const std::vector<int> &output_shape,
+                       const int &padding, const int &stride,
+                       const int &dilation, const int &group) {
+    args_.Set(input_shape, filter_shape, output_shape, padding, stride,
+              dilation, group);
+  }
+  ~CudnnNormConvolution() {}
+
+  void Forward(const platform::CUDADeviceContext &ctx, T *input_ptr,
+               T *filter_ptr, T *output_ptr, float *sum_ptr,
+               float *sum_of_squares_ptr) {
+    auto cudnn_handle = ctx.cudnn_handle();
+
+    CudnnFusionOp *fwd_op = GetForwardOp(ctx);
+    size_t workspace_size = RoundUp(
+        static_cast<int64_t>(fwd_op->GetWorkspaceSizeInBytes(cudnn_handle)),
+        512);
+
+    // Set variant_param
+    // input ptr
+    fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr);
+    fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr);
+    fwd_op->SetOpVariantParamAttrPtr(
+        CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size);
+
+    // output ptr
+    fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr);
+    fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr);
+    fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr);
+
+    ctx.cudnn_workspace_handle().RunFunc(
+        [&](void *workspace_ptr) {
+          // workspace ptr
+          fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr);
+          // fused op execute
+          fwd_op->Execute(cudnn_handle);
+        },
+        workspace_size);
+  }
+
+ private:
+  CudnnFusionOp *GetForwardOp(const platform::CUDADeviceContext &ctx) {
+    framework::AlgorithmsCache<CudnnFusionOp *> &cache =
+        *(CudnnFusionOpCache::Instance().GetForward());
+
+    CudnnFusionOp *fwd_op = cache.GetAlgorithm(
+        args_.in_dims, args_.filter_dims, args_.strides, args_.paddings,
+        args_.dilations, 0, static_cast<int64_t>(args_.dtype), [&]() {
+          CudnnFusionOp *fwd_op =
+              new CudnnFusionOp(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS);
+
+          // Set constant_param
+          fwd_op->SetOpConstParamAttr(
+              {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_WDATA_PLACEHOLDER,
+               CUDNN_PARAM_YDATA_PLACEHOLDER},
+              CUDNN_PTR_16B_ALIGNED);
+          fwd_op->SetOpConstParamAttr(
+              {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER},
+              CUDNN_PTR_16B_ALIGNED);
+
+          // conv desc
+          fwd_op->SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC,
+                                      args_.conv_desc.desc());
+          // input desc
+          fwd_op->SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc());
+          // filter desc
+          fwd_op->SetOpConstParamDesc(CUDNN_PARAM_WDESC,
+                                      args_.filter_desc.desc());
+          // output desc
+          fwd_op->SetOpConstParamDesc(CUDNN_PARAM_YDESC,
+                                      args_.out_desc.desc());
+          // output_stats desc
+          fwd_op->SetOpConstParamDesc(CUDNN_PARAM_YSTATS_DESC,
+                                      args_.out_stats_desc.desc());
+          // batch_norm mode
+          fwd_op->SetOpConstParamAttr(CUDNN_PARAM_BN_MODE,
+                                      CUDNN_BATCHNORM_SPATIAL_PERSISTENT);
+
+          // Make cudnn fused ops plan
+          fwd_op->GetWorkspaceSizeInBytes(ctx.cudnn_handle());
+          return fwd_op;
+        });
+    return fwd_op;
+  }
+
+ private:
+  NormConvolutionArgs<T> args_;
+};
+
+template <typename T>
+class CudnnNormConvolutionGrad {
+ public:
+  CudnnNormConvolutionGrad(const platform::CUDADeviceContext &ctx,
+                           const std::vector<int> &input_shape,
+                           const std::vector<int> &filter_shape,
+                           const std::vector<int> &output_shape,
+                           const int &padding, const int &stride,
+                           const int &dilation, const int &group) {
+    args_.Set(input_shape, filter_shape, output_shape, padding, stride,
+              dilation, group);
+    dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+  }
+  ~CudnnNormConvolutionGrad() {}
+
+  void Backward(const platform::CUDADeviceContext &ctx, T *input_ptr,
+                T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr,
+                T *filter_grad_ptr, bool use_addto = false) {
+    if (filter_grad_ptr) {
+      BackwardFilter(ctx, input_ptr, output_grad_ptr, filter_ptr,
+                     filter_grad_ptr);
+    }
+    if (input_grad_ptr) {
+      BackwardData(ctx, input_ptr, output_grad_ptr, filter_ptr,
+                   input_grad_ptr, use_addto);
+    }
+  }
+
+ private:
+  void BackwardFilter(const platform::CUDADeviceContext &ctx, T *input_ptr,
+                      T *output_grad_ptr, T *filter_ptr,
+                      T *filter_grad_ptr) {
+    auto cudnn_handle = ctx.cudnn_handle();
+
+    CudnnFusionOp *wgrad_op = GetBackwardFilterOp(ctx);
+    size_t workspace_size = RoundUp(
+        static_cast<int64_t>(wgrad_op->GetWorkspaceSizeInBytes(cudnn_handle)),
+        512);
+
+    wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr);
+    wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_DYDATA, output_grad_ptr);
+    wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_DWDATA, filter_grad_ptr);
+    wgrad_op->SetOpVariantParamAttrPtr(
+        CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size);
+
+    ctx.cudnn_workspace_handle().RunFunc(
+        [&](void *workspace_ptr) {
+          // workspace ptr
+          wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE,
+                                             workspace_ptr);
+          // fused op execute
+          wgrad_op->Execute(cudnn_handle);
+        },
+        workspace_size);
+  }
+
+  void BackwardData(const platform::CUDADeviceContext &ctx, T *input_ptr,
+                    T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr,
+                    bool use_addto = false) {
+    auto cudnn_handle = ctx.cudnn_handle();
+    size_t workspace_size = GetWorkspaceSizeBwdData(ctx);
+
+    // Convolution dgrad followed optionally by batchnorm dgrad
+    ScalingParamType<T> alpha = 1.0f;
+    ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;
+    ctx.cudnn_workspace_handle().RunFunc(
+        [&](void *cudnn_workspace_ptr) {
+          PADDLE_ENFORCE_CUDA_SUCCESS(
+              platform::dynload::cudnnConvolutionBackwardData(
+                  cudnn_handle, &alpha, args_.filter_desc.desc(), filter_ptr,
+                  args_.out_desc.desc(), output_grad_ptr,
+                  args_.conv_desc.desc(), dgrad_algo_, cudnn_workspace_ptr,
+                  workspace_size, &beta, args_.in_desc.desc(),
+                  input_grad_ptr));
+        },
+        workspace_size);
+  }
+
+  CudnnFusionOp *GetBackwardFilterOp(const platform::CUDADeviceContext &ctx) {
+    framework::AlgorithmsCache<CudnnFusionOp *> &cache =
+        *(CudnnFusionOpCache::Instance().GetBackward());
+
+    CudnnFusionOp *wgrad_op = cache.GetAlgorithm(
+        args_.in_dims, args_.filter_dims, args_.strides, args_.paddings,
+        args_.dilations, 0, static_cast<int64_t>(args_.dtype), [&]() {
+          CudnnFusionOp *wgrad_op =
+              new CudnnFusionOp(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD);
+
+          wgrad_op->SetOpConstParamAttr(
+              {CUDNN_PARAM_DYDATA_PLACEHOLDER, CUDNN_PARAM_XDATA_PLACEHOLDER,
+               CUDNN_PARAM_DWDATA_PLACEHOLDER},
+              CUDNN_PTR_16B_ALIGNED);
+
+          // conv desc
+          wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC,
+                                        args_.conv_desc.desc());
+          // input desc
+          wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_XDESC,
+                                        args_.in_desc.desc());
+          // filter desc
+          wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_DWDESC,
+                                        args_.filter_desc.desc());
+          // output desc
+          wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_DYDESC,
+                                        args_.out_desc.desc());
+          wgrad_op->SetOpConstParamAttr(CUDNN_PARAM_BN_MODE,
+                                        CUDNN_BATCHNORM_SPATIAL_PERSISTENT);
+
+          // Make cudnn fused ops plan
+          wgrad_op->GetWorkspaceSizeInBytes(ctx.cudnn_handle());
+          return wgrad_op;
+        });
+    return wgrad_op;
+  }
+
+  size_t GetWorkspaceSizeBwdData(const platform::CUDADeviceContext &ctx) {
+    size_t workspace_size = 0U;
+    auto handle = ctx.cudnn_handle();
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
+            handle, args_.filter_desc.desc(), args_.out_desc.desc(),
+            args_.conv_desc.desc(), args_.in_desc.desc(), dgrad_algo_,
+            &workspace_size));
+    return RoundUp(workspace_size, 512);
+  }
+
+ private:
+  NormConvolutionArgs<T> args_;
+  cudnnConvolutionBwdDataAlgo_t dgrad_algo_;
+};
+
 #endif
 }  // namespace operators
 }  // namespace paddle
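Taken together, CudnnNormConvolution caches and runs the fused forward pass (convolution plus per-channel sum and sum-of-squares statistics), while CudnnNormConvolutionGrad dispatches filter gradients to the fused wgrad op and input gradients to plain cudnnConvolutionBackwardData. A minimal caller-side sketch (shapes and device pointers below are hypothetical placeholders; allocation and context setup are elided):

#include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h"

// Hedged sketch of how a fused kernel might drive the two classes.
// All T* / float* arguments are assumed to be preallocated NHWC device
// buffers whose sizes match the example shapes.
template <typename T>
void RunNormConv(const paddle::platform::CUDADeviceContext &ctx, T *x, T *w,
                 T *y, float *sum, float *sum_of_squares, T *dy, T *dx,
                 T *dw) {
  std::vector<int> x_shape = {8, 56, 56, 32};  // NHWC input (example values)
  std::vector<int> w_shape = {64, 3, 3, 32};   // [k, h, w, c] filter
  std::vector<int> y_shape = {8, 56, 56, 64};  // NHWC output
  int padding = 1, stride = 1, dilation = 1, group = 1;

  // Forward: convolution plus per-channel sum / sum-of-squares statistics.
  paddle::operators::CudnnNormConvolution<T> conv(
      ctx, x_shape, w_shape, y_shape, padding, stride, dilation, group);
  conv.Forward(ctx, x, w, y, sum, sum_of_squares);

  // Backward: fused wgrad produces dw, cudnnConvolutionBackwardData dx.
  paddle::operators::CudnnNormConvolutionGrad<T> conv_grad(
      ctx, x_shape, w_shape, y_shape, padding, stride, dilation, group);
  conv_grad.Backward(ctx, x, dy, w, dx, dw, /*use_addto=*/false);
}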
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc (view file @ 767050d9)

This diff is collapsed. Click to expand it.