Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
767050d9
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
767050d9
编写于
9月 29, 2021
作者:
Y
Yiqun Liu
提交者:
GitHub
9月 29, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Implement the grad and enhance the cache of norm_convolution fusion ops. (#36168)
上级
b3d2dc7b
变更
4
展开全部
显示空白变更内容
内联
并排
Showing
4 changed file
with
630 addition
and
253 deletion
+630
-253
paddle/fluid/framework/operator_kernel_configs.h
paddle/fluid/framework/operator_kernel_configs.h
+2
-0
paddle/fluid/operators/fused/cudnn_fusion_helper.h
paddle/fluid/operators/fused/cudnn_fusion_helper.h
+34
-31
paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
+276
-81
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
+318
-141
未找到文件。
paddle/fluid/framework/operator_kernel_configs.h
浏览文件 @
767050d9
...
...
@@ -15,8 +15,10 @@ limitations under the License. */
#pragma once
#include <algorithm>
#include <mutex>
#include <unordered_map>
#include <vector>
#include "glog/logging.h"
namespace
paddle
{
namespace
framework
{
...
...
paddle/fluid/operators/fused/cudnn_fusion_helper.h
浏览文件 @
767050d9
...
...
@@ -14,10 +14,8 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/platform/cudnn_desc.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h"
...
...
@@ -41,12 +39,9 @@ class CudnnFusionOp {
}
~
CudnnFusionOp
()
{
// New 'fused op' descriptor destruction
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnDestroyFusedOpsVariantParamPack
(
op_variant_params_
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnDestroyFusedOpsConstParamPack
(
op_const_params_
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnDestroyFusedOpsPlan
(
op_
));
dynload
::
cudnnDestroyFusedOpsVariantParamPack
(
op_variant_params_
);
dynload
::
cudnnDestroyFusedOpsConstParamPack
(
op_const_params_
);
dynload
::
cudnnDestroyFusedOpsPlan
(
op_
);
}
// Execute fused op
...
...
@@ -121,41 +116,49 @@ class CudnnFusionOp {
// Get the workspace, which is required before Execute().
size_t
GetWorkspaceSizeInBytes
(
cudnnHandle_t
cudnn_handle
)
{
size_t
workspace_bytes
=
0U
;
if
(
!
plan_created_
)
{
workspace_bytes_
=
0U
;
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnMakeFusedOpsPlan
(
cudnn_handle
,
op_
,
op_const_params_
,
&
workspace_bytes
));
cudnn_handle
,
op_
,
op_const_params_
,
&
workspace_bytes_
));
plan_created_
=
true
;
return
workspace_bytes
;
}
return
workspace_bytes_
;
}
private:
bool
plan_created_
;
size_t
workspace_bytes_
;
cudnnFusedOpsPlan_t
op_
;
cudnnFusedOpsConstParamPack_t
op_const_params_
;
cudnnFusedOpsVariantParamPack_t
op_variant_params_
;
};
static
inline
std
::
vector
<
int
>
GetStrides
(
const
std
::
vector
<
int
>
&
shape
)
{
if
(
shape
.
size
()
<
1
)
{
return
{};
}
int
dim
=
static_cast
<
int
>
(
shape
.
size
());
std
::
vector
<
int
>
pro_shape
(
shape
);
std
::
vector
<
int
>
strides
(
dim
);
int
temp
=
pro_shape
[
1
];
pro_shape
.
erase
(
pro_shape
.
begin
()
+
1
);
pro_shape
.
push_back
(
temp
);
strides
.
back
()
=
1
;
for
(
int
i
=
dim
-
2
;
i
>=
0
;
--
i
)
{
strides
[
i
]
=
strides
[
i
+
1
]
*
pro_shape
[
i
+
1
];
}
strides
.
pop_back
();
strides
.
insert
(
strides
.
begin
()
+
1
,
1
);
return
strides
;
}
static
inline
int64_t
AlignUp
(
int64_t
a
,
int64_t
b
)
{
return
(
a
+
b
-
1
)
/
b
;
}
class
CudnnFusionOpCache
{
public:
static
CudnnFusionOpCache
&
Instance
()
{
static
CudnnFusionOpCache
instance
;
return
instance
;
}
framework
::
AlgorithmsCache
<
CudnnFusionOp
*>
*
GetForward
()
{
return
&
forward_cache_
;
}
framework
::
AlgorithmsCache
<
CudnnFusionOp
*>
*
GetBackward
()
{
return
&
backward_cache_
;
}
private:
CudnnFusionOpCache
()
{}
~
CudnnFusionOpCache
()
{
// Need to delete the memory of cache.
}
CudnnFusionOpCache
(
const
CudnnFusionOpCache
&
)
{}
private:
framework
::
AlgorithmsCache
<
CudnnFusionOp
*>
forward_cache_
;
framework
::
AlgorithmsCache
<
CudnnFusionOp
*>
backward_cache_
;
};
#endif // CUDNN_VERSION >= 8000
}
// namespace operators
...
...
paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
浏览文件 @
767050d9
...
...
@@ -15,125 +15,320 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/operators/fused/cudnn_fusion_helper.h"
#include "paddle/fluid/platform/cudnn_desc.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace
paddle
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
namespace
dynload
=
platform
::
dynload
;
template
<
typename
T
>
using
ScalingParamType
=
typename
platform
::
CudnnDataType
<
T
>::
ScalingParamType
;
#if CUDNN_VERSION >= 8000
static
size_t
RoundUp
(
int64_t
a
,
int64_t
b
)
{
return
(
a
+
b
-
1
)
/
b
*
b
;
}
template
<
typename
T
>
class
CudnnNormConvolutionOp
{
public:
CudnnNormConvolutionOp
()
:
fwd_op_
(
CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS
)
{}
~
CudnnNormConvolutionOp
()
{}
struct
NormConvolutionArgs
{
NormConvolutionArgs
()
{
dtype
=
platform
::
CudnnDataType
<
T
>::
type
;
format
=
CUDNN_TENSOR_NHWC
;
compute_type
=
platform
::
CudnnDataType
<
float
>::
type
;
}
void
Set
(
const
std
::
vector
<
int
>
&
input_shape
,
const
std
::
vector
<
int
>
&
filter_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
int
padding
,
int
stride
,
int
dilation
,
int
group
)
{
PADDLE_ENFORCE_EQ
(
input_shape
.
size
(),
4U
,
platform
::
errors
::
InvalidArgument
(
"The size of input_shape is expected to 4. But recieved "
"input_shape's size is %d, input_shape is [%s]."
,
input_shape
.
size
(),
framework
::
make_ddim
(
input_shape
)));
PADDLE_ENFORCE_EQ
(
filter_shape
.
size
(),
4U
,
platform
::
errors
::
InvalidArgument
(
"The size of filter_shape is expected to 4. But recieved "
"filter_shape's size is %d, filter_shape is [%s]."
,
filter_shape
.
size
(),
framework
::
make_ddim
(
filter_shape
)));
PADDLE_ENFORCE_EQ
(
filter_shape
[
1
]
==
filter_shape
[
2
]
&&
(
filter_shape
[
1
]
==
1
||
filter_shape
[
1
]
==
3
),
true
,
platform
::
errors
::
InvalidArgument
(
"The filter_shape is expected to store as nhwc, and "
"h = w = 1 or 3. But recieved filter_shape is [%s]."
,
framework
::
make_ddim
(
filter_shape
)));
PADDLE_ENFORCE_EQ
(
output_shape
.
size
(),
4U
,
platform
::
errors
::
InvalidArgument
(
"The size of output_shape is expected to 4. But recieved "
"filter_shape's size is %d, filter_shape is [%s]."
,
output_shape
.
size
(),
framework
::
make_ddim
(
output_shape
)));
for
(
size_t
i
=
0
;
i
<
input_shape
.
size
();
++
i
)
{
in_dims
.
push_back
(
input_shape
[
i
]);
}
for
(
size_t
i
=
0
;
i
<
filter_shape
.
size
();
++
i
)
{
filter_dims
.
push_back
(
filter_shape
[
i
]);
}
paddings
=
{
padding
,
padding
};
strides
=
{
stride
,
stride
};
dilations
=
{
dilation
,
dilation
};
in_desc
.
set
(
input_shape
,
format
,
dtype
);
filter_desc
.
set
(
filter_shape
,
format
,
dtype
,
group
);
out_desc
.
set
(
output_shape
,
format
,
dtype
);
int
output_channel
=
filter_shape
[
0
];
std
::
vector
<
int
>
stats_shape
=
{
1
,
1
,
1
,
output_channel
};
out_stats_desc
.
set
(
stats_shape
,
format
,
compute_type
);
conv_desc
.
set
(
dtype
,
paddings
,
strides
,
dilations
,
false
,
group
);
}
void
Init
(
const
platform
::
CUDADeviceContext
&
ctx
,
cudnnDataType_t
dtype
;
cudnnTensorFormat_t
format
;
cudnnDataType_t
compute_type
;
std
::
vector
<
int64_t
>
in_dims
;
std
::
vector
<
int64_t
>
filter_dims
;
std
::
vector
<
int
>
strides
;
std
::
vector
<
int
>
paddings
;
std
::
vector
<
int
>
dilations
;
platform
::
TensorDescriptor
in_desc
;
platform
::
FilterDescriptor
filter_desc
;
platform
::
TensorDescriptor
out_desc
;
platform
::
TensorDescriptor
out_stats_desc
;
platform
::
ConvolutionDescriptor
conv_desc
;
};
template
<
typename
T
>
class
CudnnNormConvolution
{
public:
CudnnNormConvolution
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
std
::
vector
<
int
>
&
input_shape
,
const
std
::
vector
<
int
>
&
filter_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
const
int
&
pad
,
const
int
&
stride
,
const
int
&
dilate
,
const
int
&
group
)
{
cudnn_fwd_compute_type_
=
platform
::
CudnnDataType
<
float
>::
type
;
dtype_
=
platform
::
CudnnDataType
<
T
>::
type
;
format_
=
CUDNN_TENSOR_NHWC
;
InitDescriptors
(
ctx
,
input_shape
,
filter_shape
,
output_shape
,
pad
,
stride
,
dilate
,
group
);
GetWorkspaceSize
(
ctx
);
const
std
::
vector
<
int
>
&
output_shape
,
const
int
&
padding
,
const
int
&
stride
,
const
int
&
dilation
,
const
int
&
group
)
{
args_
.
Set
(
input_shape
,
filter_shape
,
output_shape
,
padding
,
stride
,
dilation
,
group
);
}
~
CudnnNormConvolution
()
{}
void
Forward
(
const
platform
::
CUDADeviceContext
&
ctx
,
T
*
input_ptr
,
T
*
filter_ptr
,
T
*
output_ptr
,
float
*
sum_ptr
,
float
*
sum_of_squares_ptr
)
{
auto
handle
=
ctx
.
cudnn_handle
();
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
auto
cudnn_handle
=
ctx
.
cudnn_handle
();
CudnnFusionOp
*
fwd_op
=
GetForwardOp
(
ctx
);
size_t
workspace_size
=
RoundUp
(
static_cast
<
int64_t
>
(
fwd_op
->
GetWorkspaceSizeInBytes
(
cudnn_handle
)),
512
);
// Set variant_param
// input ptr
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_XDATA
,
input_ptr
);
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_WDATA
,
filter_ptr
);
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES
,
&
fwd_workspace_byte_
);
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_XDATA
,
input_ptr
);
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_WDATA
,
filter_ptr
);
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES
,
&
workspace_size
);
// output ptr
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_YDATA
,
output_ptr
);
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_YSUM
,
sum_ptr
);
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_YSQSUM
,
sum_of_squares_ptr
);
workspace_handle
.
RunFunc
(
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_YDATA
,
output_ptr
);
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_YSUM
,
sum_ptr
);
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_YSQSUM
,
sum_of_squares_ptr
);
ctx
.
cudnn_workspace_handle
().
RunFunc
(
[
&
](
void
*
workspace_ptr
)
{
// workspace ptr
fwd_op
_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_WORKSPACE
,
workspace_ptr
);
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_WORKSPACE
,
workspace_ptr
);
// fused op execute
fwd_op
_
.
Execute
(
handle
);
fwd_op
->
Execute
(
cudnn_
handle
);
},
fwd_workspace_byte_
);
workspace_size
);
}
// TBD
void
Backward
(
const
platform
::
CUDADeviceContext
&
ctx
)
{}
private:
void
InitDescriptors
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
std
::
vector
<
int
>
&
input_shape
,
const
std
::
vector
<
int
>
&
filter_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
const
int
&
pad
,
const
int
&
stride
,
const
int
&
dilate
,
const
int
&
group
)
{
CudnnFusionOp
*
GetForwardOp
(
const
platform
::
CUDADeviceContext
&
ctx
)
{
framework
::
AlgorithmsCache
<
CudnnFusionOp
*>
&
cache
=
*
(
CudnnFusionOpCache
::
Instance
().
GetForward
());
CudnnFusionOp
*
fwd_op
=
cache
.
GetAlgorithm
(
args_
.
in_dims
,
args_
.
filter_dims
,
args_
.
strides
,
args_
.
paddings
,
args_
.
dilations
,
0
,
static_cast
<
int64_t
>
(
args_
.
dtype
),
[
&
]()
{
CudnnFusionOp
*
fwd_op
=
new
CudnnFusionOp
(
CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS
);
// Set constant_param
fwd_op_
.
SetOpConstParamAttr
(
fwd_op
->
SetOpConstParamAttr
(
{
CUDNN_PARAM_XDATA_PLACEHOLDER
,
CUDNN_PARAM_WDATA_PLACEHOLDER
,
CUDNN_PARAM_YDATA_PLACEHOLDER
},
CUDNN_PTR_16B_ALIGNED
);
fwd_op_
.
SetOpConstParamAttr
(
fwd_op
->
SetOpConstParamAttr
(
{
CUDNN_PARAM_YSUM_PLACEHOLDER
,
CUDNN_PARAM_YSQSUM_PLACEHOLDER
},
CUDNN_PTR_16B_ALIGNED
);
std
::
vector
<
int
>
pad_vec
=
{
pad
,
pad
};
std
::
vector
<
int
>
stride_vec
=
{
stride
,
stride
};
std
::
vector
<
int
>
dilate_vec
=
{
dilate
,
dilate
};
int
output_channel
=
filter_shape
[
0
];
std
::
vector
<
int
>
stats_shape
=
{
1
,
1
,
1
,
output_channel
};
// conv desc
fwd_op
->
SetOpConstParamDesc
(
CUDNN_PARAM_CONV_DESC
,
args_
.
conv_desc
.
desc
());
// input desc
fwd_op
->
SetOpConstParamDesc
(
CUDNN_PARAM_XDESC
,
args_
.
in_desc
.
desc
());
// filter desc
fwd_op
->
SetOpConstParamDesc
(
CUDNN_PARAM_WDESC
,
args_
.
filter_desc
.
desc
());
// output desc
fwd_op
->
SetOpConstParamDesc
(
CUDNN_PARAM_YDESC
,
args_
.
out_desc
.
desc
());
// output_stats desc
fwd_op
->
SetOpConstParamDesc
(
CUDNN_PARAM_YSTATS_DESC
,
args_
.
out_stats_desc
.
desc
());
// batch_norm mode
fwd_op
->
SetOpConstParamAttr
(
CUDNN_PARAM_BN_MODE
,
CUDNN_BATCHNORM_SPATIAL_PERSISTENT
);
// set conv desc
conv_desc_
.
set
(
dtype_
,
pad_vec
,
stride_vec
,
dilate_vec
,
false
,
group
);
fwd_op_
.
SetOpConstParamDesc
(
CUDNN_PARAM_CONV_DESC
,
conv_desc_
.
desc
());
// Make cudnn fused ops plan
fwd_op
->
GetWorkspaceSizeInBytes
(
ctx
.
cudnn_handle
());
return
fwd_op
;
});
return
fwd_op
;
}
// set input desc
in_desc_
.
set
(
input_shape
,
format_
,
dtype_
)
;
fwd_op_
.
SetOpConstParamDesc
(
CUDNN_PARAM_XDESC
,
in_desc_
.
desc
())
;
private:
NormConvolutionArgs
<
T
>
args_
;
}
;
// set filter desc
filter_desc_
.
set
(
filter_shape
,
format_
,
dtype_
,
group
);
fwd_op_
.
SetOpConstParamDesc
(
CUDNN_PARAM_WDESC
,
filter_desc_
.
desc
());
template
<
typename
T
>
class
CudnnNormConvolutionGrad
{
public:
CudnnNormConvolutionGrad
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
std
::
vector
<
int
>
&
input_shape
,
const
std
::
vector
<
int
>
&
filter_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
const
int
&
padding
,
const
int
&
stride
,
const
int
&
dilation
,
const
int
&
group
)
{
args_
.
Set
(
input_shape
,
filter_shape
,
output_shape
,
padding
,
stride
,
dilation
,
group
);
dgrad_algo_
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
;
}
~
CudnnNormConvolutionGrad
()
{}
void
Backward
(
const
platform
::
CUDADeviceContext
&
ctx
,
T
*
input_ptr
,
T
*
output_grad_ptr
,
T
*
filter_ptr
,
T
*
input_grad_ptr
,
T
*
filter_grad_ptr
,
bool
use_addto
=
false
)
{
if
(
filter_grad_ptr
)
{
BackwardFilter
(
ctx
,
input_ptr
,
output_grad_ptr
,
filter_ptr
,
filter_grad_ptr
);
}
if
(
input_grad_ptr
)
{
BackwardData
(
ctx
,
input_ptr
,
output_grad_ptr
,
filter_ptr
,
input_grad_ptr
,
use_addto
);
}
}
private:
void
BackwardFilter
(
const
platform
::
CUDADeviceContext
&
ctx
,
T
*
input_ptr
,
T
*
output_grad_ptr
,
T
*
filter_ptr
,
T
*
filter_grad_ptr
)
{
auto
cudnn_handle
=
ctx
.
cudnn_handle
();
// set output desc
out_desc_
.
set
(
output_shape
,
format_
,
dtype_
);
fwd_op_
.
SetOpConstParamDesc
(
CUDNN_PARAM_YDESC
,
out_desc_
.
desc
());
CudnnFusionOp
*
wgrad_op
=
GetBackwardFilterOp
(
ctx
);
size_t
workspace_size
=
RoundUp
(
static_cast
<
int64_t
>
(
wgrad_op
->
GetWorkspaceSizeInBytes
(
cudnn_handle
)),
512
);
// set output_stats desc
out_stats_desc_
.
set
(
stats_shape
,
format_
,
cudnn_fwd_compute_type_
);
fwd_op_
.
SetOpConstParamDesc
(
CUDNN_PARAM_YSTATS_DESC
,
out_stats_desc_
.
desc
());
wgrad_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_XDATA
,
input_ptr
);
wgrad_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_DYDATA
,
output_grad_ptr
);
wgrad_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_DWDATA
,
filter_grad_ptr
);
wgrad_op
->
SetOpVariantParamAttrPtr
(
CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES
,
&
workspace_size
);
fwd_op_
.
SetOpConstParamAttr
(
CUDNN_PARAM_BN_MODE
,
CUDNN_BATCHNORM_SPATIAL
);
ctx
.
cudnn_workspace_handle
().
RunFunc
(
[
&
](
void
*
workspace_ptr
)
{
// workspace ptr
wgrad_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_WORKSPACE
,
workspace_ptr
);
// fused op execute
wgrad_op
->
Execute
(
cudnn_handle
);
},
workspace_size
);
}
void
GetWorkspaceSize
(
const
platform
::
CUDADeviceContext
&
ctx
)
{
auto
handle
=
ctx
.
cudnn_handle
();
fwd_workspace_byte_
=
fwd_op_
.
GetWorkspaceSizeInBytes
(
handle
);
void
BackwardData
(
const
platform
::
CUDADeviceContext
&
ctx
,
T
*
input_ptr
,
T
*
output_grad_ptr
,
T
*
filter_ptr
,
T
*
input_grad_ptr
,
bool
use_addto
=
false
)
{
auto
cudnn_handle
=
ctx
.
cudnn_handle
();
size_t
workspace_size
=
GetWorkspaceSizeBwdData
(
ctx
);
// Convolution dgrad followed optionally by batchnorm dgrad
ScalingParamType
<
T
>
alpha
=
1.0
f
;
ScalingParamType
<
T
>
beta
=
use_addto
?
1.0
f
:
0.0
f
;
ctx
.
cudnn_workspace_handle
().
RunFunc
(
[
&
](
void
*
cudnn_workspace_ptr
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnConvolutionBackwardData
(
cudnn_handle
,
&
alpha
,
args_
.
filter_desc
.
desc
(),
filter_ptr
,
args_
.
out_desc
.
desc
(),
output_grad_ptr
,
args_
.
conv_desc
.
desc
(),
dgrad_algo_
,
cudnn_workspace_ptr
,
workspace_size
,
&
beta
,
args_
.
in_desc
.
desc
(),
input_grad_ptr
));
},
workspace_size
);
}
size_t
fwd_workspace_byte_
=
0
;
CudnnFusionOp
*
GetBackwardFilterOp
(
const
platform
::
CUDADeviceContext
&
ctx
)
{
framework
::
AlgorithmsCache
<
CudnnFusionOp
*>
&
cache
=
*
(
CudnnFusionOpCache
::
Instance
().
GetBackward
());
CudnnFusionOp
*
wgrad_op
=
cache
.
GetAlgorithm
(
args_
.
in_dims
,
args_
.
filter_dims
,
args_
.
strides
,
args_
.
paddings
,
args_
.
dilations
,
0
,
static_cast
<
int64_t
>
(
args_
.
dtype
),
[
&
]()
{
CudnnFusionOp
*
wgrad_op
=
new
CudnnFusionOp
(
CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD
);
wgrad_op
->
SetOpConstParamAttr
(
{
CUDNN_PARAM_DYDATA_PLACEHOLDER
,
CUDNN_PARAM_XDATA_PLACEHOLDER
,
CUDNN_PARAM_DWDATA_PLACEHOLDER
},
CUDNN_PTR_16B_ALIGNED
);
// conv desc
wgrad_op
->
SetOpConstParamDesc
(
CUDNN_PARAM_CONV_DESC
,
args_
.
conv_desc
.
desc
());
// input desc
wgrad_op
->
SetOpConstParamDesc
(
CUDNN_PARAM_XDESC
,
args_
.
in_desc
.
desc
());
// filter desc
wgrad_op
->
SetOpConstParamDesc
(
CUDNN_PARAM_DWDESC
,
args_
.
filter_desc
.
desc
());
// output desc
wgrad_op
->
SetOpConstParamDesc
(
CUDNN_PARAM_DYDESC
,
args_
.
out_desc
.
desc
());
wgrad_op
->
SetOpConstParamAttr
(
CUDNN_PARAM_BN_MODE
,
CUDNN_BATCHNORM_SPATIAL_PERSISTENT
);
// Make cudnn fused ops plan
wgrad_op
->
GetWorkspaceSizeInBytes
(
ctx
.
cudnn_handle
());
return
wgrad_op
;
});
return
wgrad_op
;
}
cudnnDataType_t
dtype_
;
cudnnDataType_t
cudnn_fwd_compute_type_
;
platform
::
TensorDescriptor
in_desc_
;
platform
::
FilterDescriptor
filter_desc_
;
platform
::
TensorDescriptor
out_desc_
;
platform
::
TensorDescriptor
out_stats_desc_
;
platform
::
ConvolutionDescriptor
conv_desc_
;
cudnnTensorFormat_t
format_
;
size_t
GetWorkspaceSizeBwdData
(
const
platform
::
CUDADeviceContext
&
ctx
)
{
size_t
workspace_size
=
0U
;
auto
handle
=
ctx
.
cudnn_handle
();
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
handle
,
args_
.
filter_desc
.
desc
(),
args_
.
out_desc
.
desc
(),
args_
.
conv_desc
.
desc
(),
args_
.
in_desc
.
desc
(),
dgrad_algo_
,
&
workspace_size
));
return
RoundUp
(
workspace_size
,
512
);
}
CudnnFusionOp
fwd_op_
;
private:
NormConvolutionArgs
<
T
>
args_
;
cudnnConvolutionBwdDataAlgo_t
dgrad_algo_
;
};
#endif
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
浏览文件 @
767050d9
此差异已折叠。
点击以展开。
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录