Unverified commit 767050d9, authored by Yiqun Liu on Sep 29, 2021, committed via GitHub on Sep 29, 2021.
Implement the grad and enhance the cache of norm_convolution fusion ops. (#36168)
Parent: b3d2dc7b
Showing 4 changed files with 630 additions and 253 deletions (+630, -253):

    paddle/fluid/framework/operator_kernel_configs.h         +2    -0
    paddle/fluid/operators/fused/cudnn_fusion_helper.h        +34   -31
    paddle/fluid/operators/fused/cudnn_norm_conv.cu.h         +276  -81
    paddle/fluid/operators/fused/cudnn_norm_conv_test.cc      +318  -141
paddle/fluid/framework/operator_kernel_configs.h @ 767050d9

@@ -15,8 +15,10 @@ limitations under the License. */

 #pragma once

+#include <algorithm>
 #include <mutex>
 #include <unordered_map>
 #include <vector>
+#include "glog/logging.h"

 namespace paddle {
 namespace framework {
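Note: AlgorithmsCache, defined in this header, is what the fused-op cache added below keys its cuDNN plans on; GetAlgorithm() hashes the shape/attribute arguments and only invokes the generator functor on a cache miss. The stand-alone sketch below illustrates that lookup pattern. It is a simplified stand-in, not Paddle's actual implementation; the ShapeKeyedCache name and the hash-combination scheme are illustrative assumptions.

    #include <cstdint>
    #include <functional>
    #include <unordered_map>
    #include <vector>

    // Simplified stand-in for framework::AlgorithmsCache<TAlgorithm>:
    // key = hash of (dims, strides, paddings, dilations, flags, dtype),
    // value = whatever gen_func built on the first lookup with that key.
    template <typename TAlgorithm>
    class ShapeKeyedCache {
     public:
      TAlgorithm GetAlgorithm(const std::vector<int64_t>& dims1,
                              const std::vector<int64_t>& dims2,
                              const std::vector<int>& strides,
                              const std::vector<int>& paddings,
                              const std::vector<int>& dilations,
                              int algorithm_flags, int64_t dtype,
                              std::function<TAlgorithm()> gen_func) {
        size_t seed = 0;
        auto combine = [&seed](int64_t v) {
          seed ^= std::hash<int64_t>()(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
        };
        for (auto d : dims1) combine(d);
        for (auto d : dims2) combine(d);
        for (auto s : strides) combine(s);
        for (auto p : paddings) combine(p);
        for (auto d : dilations) combine(d);
        combine(algorithm_flags);
        combine(dtype);

        auto it = cache_.find(seed);
        if (it != cache_.end()) {
          return it->second;  // cache hit: reuse the already-built entry
        }
        TAlgorithm value = gen_func();  // cache miss: build once, then store
        cache_[seed] = value;
        return value;
      }

     private:
      std::unordered_map<size_t, TAlgorithm> cache_;
    };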
paddle/fluid/operators/fused/cudnn_fusion_helper.h @ 767050d9

@@ -14,10 +14,8 @@ limitations under the License. */

 #pragma once

 #include <string>
 #include <vector>
-#include "paddle/fluid/platform/cudnn_desc.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
+#include "paddle/fluid/framework/operator_kernel_configs.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/enforce.h"

@@ -41,12 +39,9 @@ class CudnnFusionOp {
   }

   ~CudnnFusionOp() {
-    // New 'fused op' descriptor destruction
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_));
-    PADDLE_ENFORCE_CUDA_SUCCESS(
-        dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_));
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFusedOpsPlan(op_));
+    dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_);
+    dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_);
+    dynload::cudnnDestroyFusedOpsPlan(op_);
   }

   // Execute fused op

@@ -121,41 +116,49 @@ class CudnnFusionOp {
   // Get the workspace, which is required before Execute().
   size_t GetWorkspaceSizeInBytes(cudnnHandle_t cudnn_handle) {
-    size_t workspace_bytes = 0U;
-    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan(
-        cudnn_handle, op_, op_const_params_, &workspace_bytes));
-    plan_created_ = true;
-    return workspace_bytes;
+    if (!plan_created_) {
+      workspace_bytes_ = 0U;
+      PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan(
+          cudnn_handle, op_, op_const_params_, &workspace_bytes_));
+      plan_created_ = true;
+    }
+    return workspace_bytes_;
   }

  private:
   bool plan_created_;
+  size_t workspace_bytes_;

   cudnnFusedOpsPlan_t op_;
   cudnnFusedOpsConstParamPack_t op_const_params_;
   cudnnFusedOpsVariantParamPack_t op_variant_params_;
 };

-static inline std::vector<int> GetStrides(const std::vector<int> &shape) {
-  if (shape.size() < 1) {
-    return {};
-  }
-  int dim = static_cast<int>(shape.size());
-  std::vector<int> pro_shape(shape);
-  std::vector<int> strides(dim);
-  int temp = pro_shape[1];
-  pro_shape.erase(pro_shape.begin() + 1);
-  pro_shape.push_back(temp);
-  strides.back() = 1;
-  for (int i = dim - 2; i >= 0; --i) {
-    strides[i] = strides[i + 1] * pro_shape[i + 1];
-  }
-  strides.pop_back();
-  strides.insert(strides.begin() + 1, 1);
-  return strides;
-}
-
-static inline int64_t AlignUp(int64_t a, int64_t b) { return (a + b - 1) / b; }
-
+class CudnnFusionOpCache {
+ public:
+  static CudnnFusionOpCache &Instance() {
+    static CudnnFusionOpCache instance;
+    return instance;
+  }
+
+  framework::AlgorithmsCache<CudnnFusionOp *> *GetForward() {
+    return &forward_cache_;
+  }
+  framework::AlgorithmsCache<CudnnFusionOp *> *GetBackward() {
+    return &backward_cache_;
+  }
+
+ private:
+  CudnnFusionOpCache() {}
+  ~CudnnFusionOpCache() {
+    // Need to delete the memory of cache.
+  }
+  CudnnFusionOpCache(const CudnnFusionOpCache &) {}
+
+ private:
+  framework::AlgorithmsCache<CudnnFusionOp *> forward_cache_;
+  framework::AlgorithmsCache<CudnnFusionOp *> backward_cache_;
+};

 #endif  // CUDNN_VERSION >= 8000
 }  // namespace operators
paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @ 767050d9

@@ -15,125 +15,320 @@ limitations under the License. */

 #pragma once

 #include "paddle/fluid/operators/fused/cudnn_fusion_helper.h"
+#include "paddle/fluid/platform/cudnn_desc.h"
+#include "paddle/fluid/platform/cudnn_helper.h"

 namespace paddle {
 namespace operators {

 using Tensor = framework::Tensor;
 namespace dynload = platform::dynload;
+template <typename T>
+using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;

 #if CUDNN_VERSION >= 8000

+static size_t RoundUp(int64_t a, int64_t b) { return (a + b - 1) / b * b; }
+
-template <typename T>
-class CudnnNormConvolutionOp {
- public:
-  CudnnNormConvolutionOp()
-      : fwd_op_(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS) {}
-  ~CudnnNormConvolutionOp() {}
-
-  void Init(const platform::CUDADeviceContext &ctx,
-            const std::vector<int> &input_shape,
-            const std::vector<int> &filter_shape,
-            const std::vector<int> &output_shape, const int &pad,
-            const int &stride, const int &dilate, const int &group) {
-    cudnn_fwd_compute_type_ = platform::CudnnDataType<float>::type;
-    dtype_ = platform::CudnnDataType<T>::type;
-    format_ = CUDNN_TENSOR_NHWC;
-
-    InitDescriptors(ctx, input_shape, filter_shape, output_shape, pad, stride,
-                    dilate, group);
-    GetWorkspaceSize(ctx);
-  }
-
-  void Forward(const platform::CUDADeviceContext &ctx, T *input_ptr,
-               T *filter_ptr, T *output_ptr, float *sum_ptr,
-               float *sum_of_squares_ptr) {
-    auto handle = ctx.cudnn_handle();
-    auto workspace_handle = ctx.cudnn_workspace_handle();
-    // Set variant_param
-    // input ptr
-    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr);
-    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr);
-    fwd_op_.SetOpVariantParamAttrPtr(
-        CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &fwd_workspace_byte_);
-    // output ptr
-    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr);
-    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr);
-    fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr);
-    workspace_handle.RunFunc(
-        [&](void *workspace_ptr) {
-          // workspace ptr
-          fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr);
-          // fused op execute
-          fwd_op_.Execute(handle);
-        },
-        fwd_workspace_byte_);
-  }
-
-  // TBD
-  void Backward(const platform::CUDADeviceContext &ctx) {}
-
- private:
-  void InitDescriptors(const platform::CUDADeviceContext &ctx,
-                       const std::vector<int> &input_shape,
-                       const std::vector<int> &filter_shape,
-                       const std::vector<int> &output_shape, const int &pad,
-                       const int &stride, const int &dilate,
-                       const int &group) {
-    // Set constant_param
-    fwd_op_.SetOpConstParamAttr(
-        {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_WDATA_PLACEHOLDER,
-         CUDNN_PARAM_YDATA_PLACEHOLDER},
-        CUDNN_PTR_16B_ALIGNED);
-    fwd_op_.SetOpConstParamAttr(
-        {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER},
-        CUDNN_PTR_16B_ALIGNED);
-
-    std::vector<int> pad_vec = {pad, pad};
-    std::vector<int> stride_vec = {stride, stride};
-    std::vector<int> dilate_vec = {dilate, dilate};
-    int output_channel = filter_shape[0];
-    std::vector<int> stats_shape = {1, 1, 1, output_channel};
-
-    // set conv desc
-    conv_desc_.set(dtype_, pad_vec, stride_vec, dilate_vec, false, group);
-    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC, conv_desc_.desc());
-
-    // set input desc
-    in_desc_.set(input_shape, format_, dtype_);
-    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, in_desc_.desc());
-
-    // set filter desc
-    filter_desc_.set(filter_shape, format_, dtype_, group);
-    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_WDESC, filter_desc_.desc());
-
-    // set output desc
-    out_desc_.set(output_shape, format_, dtype_);
-    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YDESC, out_desc_.desc());
-
-    // set output_stats desc
-    out_stats_desc_.set(stats_shape, format_, cudnn_fwd_compute_type_);
-    fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YSTATS_DESC,
-                                out_stats_desc_.desc());
-
-    fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, CUDNN_BATCHNORM_SPATIAL);
-  }
-
-  void GetWorkspaceSize(const platform::CUDADeviceContext &ctx) {
-    auto handle = ctx.cudnn_handle();
-    fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(handle);
-  }
-
-  size_t fwd_workspace_byte_ = 0;
-  cudnnDataType_t dtype_;
-  cudnnDataType_t cudnn_fwd_compute_type_;
-  platform::TensorDescriptor in_desc_;
-  platform::FilterDescriptor filter_desc_;
-  platform::TensorDescriptor out_desc_;
-  platform::TensorDescriptor out_stats_desc_;
-  platform::ConvolutionDescriptor conv_desc_;
-  cudnnTensorFormat_t format_;
-  CudnnFusionOp fwd_op_;
-};
+template <typename T>
+struct NormConvolutionArgs {
+  NormConvolutionArgs() {
+    dtype = platform::CudnnDataType<T>::type;
+    format = CUDNN_TENSOR_NHWC;
+    compute_type = platform::CudnnDataType<float>::type;
+  }
+
+  void Set(const std::vector<int> &input_shape,
+           const std::vector<int> &filter_shape,
+           const std::vector<int> &output_shape, int padding, int stride,
+           int dilation, int group) {
+    PADDLE_ENFORCE_EQ(
+        input_shape.size(), 4U,
+        platform::errors::InvalidArgument(
+            "The size of input_shape is expected to 4. But recieved "
+            "input_shape's size is %d, input_shape is [%s].",
+            input_shape.size(), framework::make_ddim(input_shape)));
+    PADDLE_ENFORCE_EQ(
+        filter_shape.size(), 4U,
+        platform::errors::InvalidArgument(
+            "The size of filter_shape is expected to 4. But recieved "
+            "filter_shape's size is %d, filter_shape is [%s].",
+            filter_shape.size(), framework::make_ddim(filter_shape)));
+    PADDLE_ENFORCE_EQ(filter_shape[1] == filter_shape[2] &&
+                          (filter_shape[1] == 1 || filter_shape[1] == 3),
+                      true,
+                      platform::errors::InvalidArgument(
+                          "The filter_shape is expected to store as nhwc, and "
+                          "h = w = 1 or 3. But recieved filter_shape is [%s].",
+                          framework::make_ddim(filter_shape)));
+    PADDLE_ENFORCE_EQ(
+        output_shape.size(), 4U,
+        platform::errors::InvalidArgument(
+            "The size of output_shape is expected to 4. But recieved "
+            "filter_shape's size is %d, filter_shape is [%s].",
+            output_shape.size(), framework::make_ddim(output_shape)));
+
+    for (size_t i = 0; i < input_shape.size(); ++i) {
+      in_dims.push_back(input_shape[i]);
+    }
+    for (size_t i = 0; i < filter_shape.size(); ++i) {
+      filter_dims.push_back(filter_shape[i]);
+    }
+    paddings = {padding, padding};
+    strides = {stride, stride};
+    dilations = {dilation, dilation};
+
+    in_desc.set(input_shape, format, dtype);
+    filter_desc.set(filter_shape, format, dtype, group);
+    out_desc.set(output_shape, format, dtype);
+
+    int output_channel = filter_shape[0];
+    std::vector<int> stats_shape = {1, 1, 1, output_channel};
+    out_stats_desc.set(stats_shape, format, compute_type);
+
+    conv_desc.set(dtype, paddings, strides, dilations, false, group);
+  }
+
+  cudnnDataType_t dtype;
+  cudnnTensorFormat_t format;
+  cudnnDataType_t compute_type;
+
+  std::vector<int64_t> in_dims;
+  std::vector<int64_t> filter_dims;
+  std::vector<int> strides;
+  std::vector<int> paddings;
+  std::vector<int> dilations;
+
+  platform::TensorDescriptor in_desc;
+  platform::FilterDescriptor filter_desc;
+  platform::TensorDescriptor out_desc;
+  platform::TensorDescriptor out_stats_desc;
+  platform::ConvolutionDescriptor conv_desc;
+};
+
+template <typename T>
+class CudnnNormConvolution {
+ public:
+  CudnnNormConvolution(const platform::CUDADeviceContext &ctx,
+                       const std::vector<int> &input_shape,
+                       const std::vector<int> &filter_shape,
+                       const std::vector<int> &output_shape,
+                       const int &padding, const int &stride,
+                       const int &dilation, const int &group) {
+    args_.Set(input_shape, filter_shape, output_shape, padding, stride,
+              dilation, group);
+  }
+  ~CudnnNormConvolution() {}
+
+  void Forward(const platform::CUDADeviceContext &ctx, T *input_ptr,
+               T *filter_ptr, T *output_ptr, float *sum_ptr,
+               float *sum_of_squares_ptr) {
+    auto cudnn_handle = ctx.cudnn_handle();
+
+    CudnnFusionOp *fwd_op = GetForwardOp(ctx);
+    size_t workspace_size = RoundUp(
+        static_cast<int64_t>(fwd_op->GetWorkspaceSizeInBytes(cudnn_handle)),
+        512);
+
+    // Set variant_param
+    // input ptr
+    fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr);
+    fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr);
+    fwd_op->SetOpVariantParamAttrPtr(
+        CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size);
+    // output ptr
+    fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr);
+    fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr);
+    fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr);
+
+    ctx.cudnn_workspace_handle().RunFunc(
+        [&](void *workspace_ptr) {
+          // workspace ptr
+          fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr);
+          // fused op execute
+          fwd_op->Execute(cudnn_handle);
+        },
+        workspace_size);
+  }
+
+ private:
+  CudnnFusionOp *GetForwardOp(const platform::CUDADeviceContext &ctx) {
+    framework::AlgorithmsCache<CudnnFusionOp *> &cache =
+        *(CudnnFusionOpCache::Instance().GetForward());
+
+    CudnnFusionOp *fwd_op = cache.GetAlgorithm(
+        args_.in_dims, args_.filter_dims, args_.strides, args_.paddings,
+        args_.dilations, 0, static_cast<int64_t>(args_.dtype), [&]() {
+          CudnnFusionOp *fwd_op =
+              new CudnnFusionOp(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS);
+
+          // Set constant_param
+          fwd_op->SetOpConstParamAttr(
+              {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_WDATA_PLACEHOLDER,
+               CUDNN_PARAM_YDATA_PLACEHOLDER},
+              CUDNN_PTR_16B_ALIGNED);
+          fwd_op->SetOpConstParamAttr(
+              {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER},
+              CUDNN_PTR_16B_ALIGNED);
+
+          // conv desc
+          fwd_op->SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC,
+                                      args_.conv_desc.desc());
+          // input desc
+          fwd_op->SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc());
+          // filter desc
+          fwd_op->SetOpConstParamDesc(CUDNN_PARAM_WDESC,
+                                      args_.filter_desc.desc());
+          // output desc
+          fwd_op->SetOpConstParamDesc(CUDNN_PARAM_YDESC, args_.out_desc.desc());
+          // output_stats desc
+          fwd_op->SetOpConstParamDesc(CUDNN_PARAM_YSTATS_DESC,
+                                      args_.out_stats_desc.desc());
+          // batch_norm mode
+          fwd_op->SetOpConstParamAttr(CUDNN_PARAM_BN_MODE,
+                                      CUDNN_BATCHNORM_SPATIAL_PERSISTENT);
+
+          // Make cudnn fused ops plan
+          fwd_op->GetWorkspaceSizeInBytes(ctx.cudnn_handle());
+          return fwd_op;
+        });
+    return fwd_op;
+  }
+
+ private:
+  NormConvolutionArgs<T> args_;
+};
+
+template <typename T>
+class CudnnNormConvolutionGrad {
+ public:
+  CudnnNormConvolutionGrad(const platform::CUDADeviceContext &ctx,
+                           const std::vector<int> &input_shape,
+                           const std::vector<int> &filter_shape,
+                           const std::vector<int> &output_shape,
+                           const int &padding, const int &stride,
+                           const int &dilation, const int &group) {
+    args_.Set(input_shape, filter_shape, output_shape, padding, stride,
+              dilation, group);
+    dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+  }
+  ~CudnnNormConvolutionGrad() {}
+
+  void Backward(const platform::CUDADeviceContext &ctx, T *input_ptr,
+                T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr,
+                T *filter_grad_ptr, bool use_addto = false) {
+    if (filter_grad_ptr) {
+      BackwardFilter(ctx, input_ptr, output_grad_ptr, filter_ptr,
+                     filter_grad_ptr);
+    }
+    if (input_grad_ptr) {
+      BackwardData(ctx, input_ptr, output_grad_ptr, filter_ptr, input_grad_ptr,
+                   use_addto);
+    }
+  }
+
+ private:
+  void BackwardFilter(const platform::CUDADeviceContext &ctx, T *input_ptr,
+                      T *output_grad_ptr, T *filter_ptr, T *filter_grad_ptr) {
+    auto cudnn_handle = ctx.cudnn_handle();
+
+    CudnnFusionOp *wgrad_op = GetBackwardFilterOp(ctx);
+    size_t workspace_size = RoundUp(
+        static_cast<int64_t>(wgrad_op->GetWorkspaceSizeInBytes(cudnn_handle)),
+        512);
+
+    wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr);
+    wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_DYDATA, output_grad_ptr);
+    wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_DWDATA, filter_grad_ptr);
+    wgrad_op->SetOpVariantParamAttrPtr(
+        CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size);
+
+    ctx.cudnn_workspace_handle().RunFunc(
+        [&](void *workspace_ptr) {
+          // workspace ptr
+          wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE,
+                                             workspace_ptr);
+          // fused op execute
+          wgrad_op->Execute(cudnn_handle);
+        },
+        workspace_size);
+  }
+
+  void BackwardData(const platform::CUDADeviceContext &ctx, T *input_ptr,
+                    T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr,
+                    bool use_addto = false) {
+    auto cudnn_handle = ctx.cudnn_handle();
+    size_t workspace_size = GetWorkspaceSizeBwdData(ctx);
+
+    // Convolution dgrad followed optionally by batchnorm dgrad
+    ScalingParamType<T> alpha = 1.0f;
+    ScalingParamType<T> beta = use_addto ? 1.0f : 0.0f;
+    ctx.cudnn_workspace_handle().RunFunc(
+        [&](void *cudnn_workspace_ptr) {
+          PADDLE_ENFORCE_CUDA_SUCCESS(
+              platform::dynload::cudnnConvolutionBackwardData(
+                  cudnn_handle, &alpha, args_.filter_desc.desc(), filter_ptr,
+                  args_.out_desc.desc(), output_grad_ptr,
+                  args_.conv_desc.desc(), dgrad_algo_, cudnn_workspace_ptr,
+                  workspace_size, &beta, args_.in_desc.desc(),
+                  input_grad_ptr));
+        },
+        workspace_size);
+  }
+
+  CudnnFusionOp *GetBackwardFilterOp(const platform::CUDADeviceContext &ctx) {
+    framework::AlgorithmsCache<CudnnFusionOp *> &cache =
+        *(CudnnFusionOpCache::Instance().GetBackward());
+
+    CudnnFusionOp *wgrad_op = cache.GetAlgorithm(
+        args_.in_dims, args_.filter_dims, args_.strides, args_.paddings,
+        args_.dilations, 0, static_cast<int64_t>(args_.dtype), [&]() {
+          CudnnFusionOp *wgrad_op =
+              new CudnnFusionOp(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD);
+
+          wgrad_op->SetOpConstParamAttr(
+              {CUDNN_PARAM_DYDATA_PLACEHOLDER, CUDNN_PARAM_XDATA_PLACEHOLDER,
+               CUDNN_PARAM_DWDATA_PLACEHOLDER},
+              CUDNN_PTR_16B_ALIGNED);
+
+          // conv desc
+          wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC,
+                                        args_.conv_desc.desc());
+          // input desc
+          wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_XDESC,
+                                        args_.in_desc.desc());
+          // filter desc
+          wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_DWDESC,
+                                        args_.filter_desc.desc());
+          // output desc
+          wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_DYDESC,
+                                        args_.out_desc.desc());
+
+          wgrad_op->SetOpConstParamAttr(CUDNN_PARAM_BN_MODE,
+                                        CUDNN_BATCHNORM_SPATIAL_PERSISTENT);
+
+          // Make cudnn fused ops plan
+          wgrad_op->GetWorkspaceSizeInBytes(ctx.cudnn_handle());
+          return wgrad_op;
+        });
+    return wgrad_op;
+  }
+
+  size_t GetWorkspaceSizeBwdData(const platform::CUDADeviceContext &ctx) {
+    size_t workspace_size = 0U;
+    auto handle = ctx.cudnn_handle();
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
+            handle, args_.filter_desc.desc(), args_.out_desc.desc(),
+            args_.conv_desc.desc(), args_.in_desc.desc(), dgrad_algo_,
+            &workspace_size));
+    return RoundUp(workspace_size, 512);
+  }
+
+ private:
+  NormConvolutionArgs<T> args_;
+  cudnnConvolutionBwdDataAlgo_t dgrad_algo_;
+};

 #endif
 }  // namespace operators
 }  // namespace paddle
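For orientation, the expected call pattern for the two classes added above mirrors the FusedForward/FusedBackward helpers in the test file that follows. The sketch below condenses that pattern into one hypothetical wrapper; the wrapper name and the assumption that all device pointers already reference NHWC GPU tensors are ours, while the class and method signatures come from the diff above.

    #include <vector>
    #include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h"

    // Hypothetical helper: run the fused forward and backward passes once.
    template <typename T>
    void RunNormConvForwardBackward(
        const paddle::platform::CUDADeviceContext &ctx,
        const std::vector<int> &input_shape,   // NHWC
        const std::vector<int> &filter_shape,  // OHWI (h = w = 1 or 3)
        const std::vector<int> &output_shape,  // NHWC
        int padding, int stride, int dilation, int group, T *input_ptr,
        T *filter_ptr, T *output_ptr, float *sum_ptr, float *sum_of_squares_ptr,
        T *output_grad_ptr, T *input_grad_ptr, T *filter_grad_ptr) {
      namespace op = paddle::operators;

      // Forward: convolution output plus per-channel sum / sum-of-squares;
      // the underlying cuDNN fused-ops plan is looked up in CudnnFusionOpCache.
      op::CudnnNormConvolution<T> conv_op(ctx, input_shape, filter_shape,
                                          output_shape, padding, stride,
                                          dilation, group);
      conv_op.Forward(ctx, input_ptr, filter_ptr, output_ptr, sum_ptr,
                      sum_of_squares_ptr);

      // Backward: filter gradient via the fused WGRAD op, input gradient via
      // cudnnConvolutionBackwardData; pass nullptr to skip either gradient.
      op::CudnnNormConvolutionGrad<T> conv_grad_op(ctx, input_shape,
                                                   filter_shape, output_shape,
                                                   padding, stride, dilation,
                                                   group);
      conv_grad_op.Backward(ctx, input_ptr, output_grad_ptr, filter_ptr,
                            input_grad_ptr, filter_grad_ptr,
                            /*use_addto=*/false);
    }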
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @ 767050d9

@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

 #include <random>
 #include <vector>

@@ -29,23 +30,80 @@ namespace op = paddle::operators;
 using Tensor = paddle::framework::Tensor;

 USE_OP(conv2d);
+USE_OP(conv2d_grad);
 USE_OP_DEVICE_KERNEL(conv2d, CUDNN);
+USE_OP_DEVICE_KERNEL(conv2d_grad, CUDNN);

+template <typename T>
+void InitRandomTensor(const std::vector<int64_t> &dims,
+                      framework::Tensor *cpu_out) {
+  T *cpu_out_ptr = cpu_out->mutable_data<T>(framework::make_ddim(dims),
+                                            platform::CPUPlace());
+
+  std::default_random_engine random(0);
+  std::uniform_real_distribution<float> dis(0.0, 1.0);
+  for (int i = 0; i < cpu_out->numel(); ++i) {
+    cpu_out_ptr[i] = static_cast<T>(dis(random));
+  }
+}
+
+template <typename T>
+void TransposeNchwToNhwc(const framework::Tensor &cpu_in,
+                         framework::Tensor *cpu_out) {
+  auto in_dims = cpu_in.dims();
+  EXPECT_EQ(cpu_in.dims().size(), 4);
+
+  const T *cpu_in_ptr = cpu_in.data<T>();
+  T *cpu_out_ptr = cpu_out->mutable_data<T>(
+      {in_dims[0], in_dims[2], in_dims[3], in_dims[1]}, platform::CPUPlace());
+
+  int64_t n = in_dims[0];
+  int64_t c = in_dims[1];
+  int64_t hw = in_dims[2] * in_dims[3];
+  for (int i = 0; i < n; ++i) {
+    for (int j = 0; j < hw; ++j) {
+      for (int k = 0; k < c; ++k) {
+        int dst_idx = i * hw * c + j * c + k;
+        int src_idx = i * c * hw + k * hw + j;
+        cpu_out_ptr[dst_idx] = cpu_in_ptr[src_idx];
+      }
+    }
+  }
+}
+
+template <typename T>
+void CheckOutput(const framework::Tensor &cpu_res,
+                 const framework::Tensor &cpu_base, float diff,
+                 bool is_relative_atol = false) {
+  EXPECT_EQ(cpu_res.dims(), cpu_base.dims());
+
+  const T *cpu_res_ptr = cpu_res.data<T>();
+  const T *cpu_base_ptr = cpu_base.data<T>();
+  for (int i = 0; i < cpu_res.numel(); ++i) {
+    if (is_relative_atol) {
+      EXPECT_LT(static_cast<float>(std::abs(
+                    (cpu_res_ptr[i] - cpu_base_ptr[i]) / cpu_base_ptr[i])),
+                diff);
+    } else {
+      EXPECT_LT(static_cast<float>(std::abs(cpu_res_ptr[i] - cpu_base_ptr[i])),
+                diff);
+    }
+  }
+}
+
-// get paddle conv2d op results as baseline
+// Use Paddle conv2d op results as baseline
 template <typename T>
-void Conv2DForwardCompute(const Tensor &x, const Tensor &w, Tensor *y,
-                          const platform::CUDADeviceContext &ctx) {
+void ComputeConv2DForward(const platform::CUDADeviceContext &ctx,
+                          const Tensor &cpu_input, const Tensor &cpu_filter,
+                          Tensor *cpu_output) {
   framework::Scope scope;
-  auto var_x = scope.Var("Input");
-  auto tensor_x = var_x->GetMutable<framework::LoDTensor>();
-  auto var_w = scope.Var("Filter");
-  auto tensor_w = var_w->GetMutable<framework::LoDTensor>();
-  auto var_y = scope.Var("Output");
-  auto tensor_y = var_y->GetMutable<framework::LoDTensor>();
+  auto *input = scope.Var("Input")->GetMutable<framework::LoDTensor>();
+  auto *filter = scope.Var("Filter")->GetMutable<framework::LoDTensor>();
+  auto *output = scope.Var("Output")->GetMutable<framework::LoDTensor>();

   auto place = ctx.GetPlace();
-  TensorCopySync(x, place, tensor_x);
-  TensorCopySync(w, place, tensor_w);
+  TensorCopySync(cpu_input, place, input);
+  TensorCopySync(cpu_filter, place, filter);

   framework::AttributeMap attrs;
   bool use_cudnn = true;

@@ -60,25 +118,94 @@ void Conv2DForwardCompute(const Tensor &x, const Tensor &w, Tensor *y,
                                            {{"Output", {"Output"}}}, attrs);
   op->Run(scope, ctx.GetPlace());
-  TensorCopySync(*tensor_y, place, y);
-  ctx.Wait();
+  TensorCopySync(*output, platform::CPUPlace(), cpu_output);
 }

+// Use Paddle conv2d_grad op results as baseline
+template <typename T>
+void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx,
+                           const Tensor &cpu_input, const Tensor &cpu_filter,
+                           const Tensor &cpu_output_grad,
+                           framework::Tensor *cpu_input_grad,
+                           framework::Tensor *cpu_filter_grad, int stride,
+                           int padding, int dilation) {
+  framework::Scope scope;
+  auto *input = scope.Var("Input")->GetMutable<framework::LoDTensor>();
+  auto *filter = scope.Var("Filter")->GetMutable<framework::LoDTensor>();
+  auto *output_grad =
+      scope.Var("Output@GRAD")->GetMutable<framework::LoDTensor>();
+  auto *input_grad =
+      scope.Var("Input@GRAD")->GetMutable<framework::LoDTensor>();
+  auto *filter_grad =
+      scope.Var("Filter@GRAD")->GetMutable<framework::LoDTensor>();
+
+  auto place = ctx.GetPlace();
+  TensorCopySync(cpu_input, place, input);
+  TensorCopySync(cpu_filter, place, filter);
+  TensorCopySync(cpu_output_grad, place, output_grad);
+
+  framework::AttributeMap attrs;
+  bool use_cudnn = true;
+  std::string data_format = "NHWC";
+  std::string padding_algorithm = "SAME";
+  std::vector<int> strides = {stride, stride};
+  std::vector<int> paddings = {padding, padding};
+  std::vector<int> dilations = {dilation, dilation};
+  int groups = 1;
+  bool exhaustive_search = false;
+  bool use_addto = false;
+  attrs.insert({"use_cudnn", use_cudnn});
+  attrs.insert({"data_format", data_format});
+  attrs.insert({"padding_algorithm", padding_algorithm});
+  attrs.insert({"strides", strides});
+  attrs.insert({"paddings", paddings});
+  attrs.insert({"dilations", dilations});
+  attrs.insert({"groups", groups});
+  attrs.insert({"exhaustive_search", exhaustive_search});
+  attrs.insert({"use_addto", use_addto});
+
+  auto op = framework::OpRegistry::CreateOp(
+      "conv2d_grad",
+      {{"Input", {"Input"}},
+       {"Filter", {"Filter"}},
+       {"Output@GRAD", {"Output@GRAD"}}},
+      {{"Input@GRAD", {"Input@GRAD"}}, {"Filter@GRAD", {"Filter@GRAD"}}},
+      attrs);
+  op->Run(scope, ctx.GetPlace());
+
+  TensorCopySync(*input_grad, platform::CPUPlace(), cpu_input_grad);
+  TensorCopySync(*filter_grad, platform::CPUPlace(), cpu_filter_grad);
+}
+
+template <typename T>
+void ComputeSumAndSquareSum(const framework::Tensor &cpu_out,
+                            framework::Tensor *cpu_sum,
+                            framework::Tensor *cpu_sum_of_square) {
+  auto dims = cpu_out.dims();
+  int64_t c = dims[3];
+
+  const T *cpu_out_ptr = cpu_out.data<T>();
+  float *cpu_sum_ptr =
+      cpu_sum->mutable_data<float>({1, 1, 1, c}, platform::CPUPlace());
+  float *cpu_sum_square_ptr = cpu_sum_of_square->mutable_data<float>(
+      {1, 1, 1, c}, platform::CPUPlace());
+
+  for (int j = 0; j < c; ++j) {
+    float tmp_sum = 0.0f;
+    float tmp_sum_of_squares = 0.0f;
+    for (int i = 0; i < cpu_out.numel() / c; ++i) {
+      float tmp_out = static_cast<float>(cpu_out_ptr[i * c + j]);
+      tmp_sum += tmp_out;
+      tmp_sum_of_squares += tmp_out * tmp_out;
+    }
+    cpu_sum_ptr[j] = tmp_sum;
+    cpu_sum_square_ptr[j] = tmp_sum_of_squares;
+  }
+}
+
 template <typename T>
-class TestCudnnNormConvOpForward {
+class CudnnNormConvolutionTester {
  public:
-  TestCudnnNormConvOpForward() {
-    batch_size_ = 2;
-    height_ = 8;
-    width_ = 8;
-    input_channels_ = 8;
-    output_channels_ = 32;
-    kernel_size_ = 1;
-    stride_ = 1;
-    pad_ = 0;
-  }
-
-  TestCudnnNormConvOpForward(int batch_size, int height, int width,
+  CudnnNormConvolutionTester(int batch_size, int height, int width,
                              int input_channels, int output_channels,
                              int kernel_size, int stride) {
     batch_size_ = batch_size;

@@ -88,133 +215,183 @@ class TestCudnnNormConvOpForward {
     output_channels_ = output_channels;
     kernel_size_ = kernel_size;
     stride_ = stride;
-    pad_ = (kernel_size_ - 1) / 2;
+    padding_ = (kernel_size_ - 1) / 2;
+    SetUp();
   }
-  ~TestCudnnNormConvOpForward() {}
+  ~CudnnNormConvolutionTester() {}

-  void SetUp() {
-    input_size_ = batch_size_ * height_ * width_ * input_channels_;
-    filter_size_ =
-        output_channels_ * input_channels_ * kernel_size_ * kernel_size_;
-    output_size_ = batch_size_ * height_ * width_ * output_channels_;
-    param_size_ = output_channels_;
-
-    input_vec_.resize(input_size_);
-    filter_raw_vec_.resize(filter_size_);
-    filter_pro_vec_.resize(filter_size_);
-
-    std::default_random_engine random(0);
-    std::uniform_real_distribution<float> dis(0.0, 1.0);
-    for (int i = 0; i < input_size_; ++i) {
-      input_vec_[i] = static_cast<T>(dis(random));
-    }
-    for (int i = 0; i < filter_size_; ++i) {
-      filter_raw_vec_[i] = static_cast<T>(dis(random));
-    }
-    // transpoes for filter
-    // NCHW->NHWC
-    for (int oc = 0; oc < output_channels_; ++oc) {
-      for (int kh = 0; kh < kernel_size_; ++kh) {
-        for (int kw = 0; kw < kernel_size_; ++kw) {
-          for (int ic = 0; ic < input_channels_; ++ic) {
-            int dst_idx = oc * kernel_size_ * kernel_size_ * input_channels_ +
-                          kh * kernel_size_ * input_channels_ +
-                          kw * input_channels_ + ic;
-            int src_idx = oc * kernel_size_ * kernel_size_ * input_channels_ +
-                          ic * kernel_size_ * kernel_size_ +
-                          kh * kernel_size_ + kw;
-            filter_pro_vec_[dst_idx] = filter_raw_vec_[src_idx];
-          }
-        }
-      }
-    }
-
-    framework::TensorFromVector<T>(input_vec_, *ctx_, &input_);
-    input_.Resize({batch_size_, height_, width_, input_channels_});
-    framework::TensorFromVector<T>(filter_raw_vec_, *ctx_, &filter_raw_);
-    filter_raw_.Resize(
-        {output_channels_, input_channels_, kernel_size_, kernel_size_});
-    framework::TensorFromVector<T>(filter_pro_vec_, *ctx_, &filter_pro_);
-    filter_pro_.Resize(
-        {output_channels_, kernel_size_, kernel_size_, input_channels_});
-    output_.Resize({batch_size_, height_, width_, output_channels_});
-    base_output_.Resize({batch_size_, height_, width_, output_channels_});
-    sum_.Resize({1, 1, 1, output_channels_});
-    sum_of_squares_.Resize({1, 1, 1, output_channels_});
-    ctx_->Wait();
-  }
-
-  void BaselineForward() {
-    Conv2DForwardCompute<T>(input_, filter_raw_, &base_output_, *ctx_);
-    ctx_->Wait();
-  }
-
-  // get forward results of cudnn_norm_conv
-  void FusedForward() {
-    auto input_shape = framework::vectorize<int>(input_.dims());
-    auto filter_shape = framework::vectorize<int>(filter_pro_.dims());
-    auto output_shape = framework::vectorize<int>(output_.dims());
-    T *input_ptr = input_.data<T>();
-    T *filter_ptr = filter_pro_.data<T>();
-    T *output_ptr = output_.mutable_data<T>(place_);
-    float *sum_ptr = sum_.mutable_data<float>(place_);
-    float *sum_of_squares_ptr = sum_of_squares_.mutable_data<float>(place_);
-
-    std::shared_ptr<op::CudnnNormConvolutionOp<T>> conv_op(
-        new op::CudnnNormConvolutionOp<T>());
-    conv_op->Init(*ctx_, input_shape, filter_shape, output_shape, pad_,
-                  stride_, dilate_, group_);
-    conv_op->Forward(*ctx_, input_ptr, filter_ptr, output_ptr, sum_ptr,
-                     sum_of_squares_ptr);
-    ctx_->Wait();
-  }
-
-  void Run() {
-    SetUp();
-    BaselineForward();
-    FusedForward();
-  }
-
-  // check forward correctness between baseline and results of normconv.
-  void CheckOut(const T diff, bool is_relative_atol = false) {
-    std::vector<T> base_output_vec, output_vec;
-    output_vec.resize(output_size_);
-    base_output_vec.resize(output_size_);
-    TensorToVector(base_output_, *ctx_, &base_output_vec);
-    TensorToVector(output_, *ctx_, &output_vec);
-    ctx_->Wait();
-
-    for (int i = 0; i < output_size_; ++i) {
-      if (is_relative_atol) {
-        EXPECT_LT(std::abs((output_vec[i] - base_output_vec[i]) /
-                           base_output_vec[i]),
-                  diff);
-      } else {
-        EXPECT_LT(std::abs(output_vec[i] - base_output_vec[i]), diff);
-      }
-    }
-  }
-
- private:
-  int batch_size_, height_, width_, input_channels_, output_channels_;
-  int kernel_size_, stride_, pad_;
-  const int dilate_ = 1;
-  const int group_ = 1;
-  int input_size_, filter_size_, output_size_, param_size_;
-
-  framework::Tensor input_, filter_raw_, filter_pro_, output_, base_output_;
-  framework::Tensor sum_, sum_of_squares_;
-  std::vector<T> input_vec_, filter_raw_vec_, filter_pro_vec_;
-
-  platform::CUDAPlace place_ = platform::CUDAPlace(0);
-  platform::CUDADeviceContext *ctx_ =
-      static_cast<platform::CUDADeviceContext *>(
-          platform::DeviceContextPool::Instance().Get(place_));
-};
+  void CheckForward(float diff, bool is_relative_atol = false) {
+    platform::CUDADeviceContext *ctx =
+        static_cast<platform::CUDADeviceContext *>(
+            platform::DeviceContextPool::Instance().Get(
+                platform::CUDAPlace(0)));
+
+    framework::Tensor cpu_output_base;
+    framework::Tensor cpu_sum_base;
+    framework::Tensor cpu_sum_of_square_base;
+    BaselineForward(*ctx, &cpu_output_base, &cpu_sum_base,
+                    &cpu_sum_of_square_base);
+
+    framework::Tensor cpu_output;
+    framework::Tensor cpu_sum;
+    framework::Tensor cpu_sum_of_square;
+    FusedForward(*ctx, &cpu_output, &cpu_sum, &cpu_sum_of_square);
+
+    // Check forward correctness between baseline and results of normconv.
+    CheckOutput<T>(cpu_output, cpu_output_base, diff, is_relative_atol);
+    CheckOutput<float>(cpu_sum, cpu_sum_base, diff, is_relative_atol);
+    CheckOutput<float>(cpu_sum_of_square, cpu_sum_of_square_base, diff,
+                       is_relative_atol);
+  }
+
+  void CheckBackward(float diff, bool is_relative_atol = false) {
+    platform::CUDADeviceContext *ctx =
+        static_cast<platform::CUDADeviceContext *>(
+            platform::DeviceContextPool::Instance().Get(
+                platform::CUDAPlace(0)));
+
+    framework::Tensor cpu_input_grad_base;
+    framework::Tensor cpu_filter_nchw_grad_base;
+    framework::Tensor cpu_filter_nhwc_grad_base;
+    BaselineBackward(*ctx, &cpu_input_grad_base, &cpu_filter_nchw_grad_base);
+    TransposeNchwToNhwc<T>(cpu_filter_nchw_grad_base,
+                           &cpu_filter_nhwc_grad_base);
+
+    framework::Tensor cpu_input_grad;
+    framework::Tensor cpu_filter_nhwc_grad;
+    FusedBackward(*ctx, &cpu_input_grad, &cpu_filter_nhwc_grad);
+
+    // Check backward correctness between baseline and results of normconv.
+    CheckOutput<T>(cpu_input_grad, cpu_input_grad_base, diff,
+                   is_relative_atol);
+    CheckOutput<T>(cpu_filter_nhwc_grad, cpu_filter_nhwc_grad_base, diff,
+                   is_relative_atol);
+  }
+
+ private:
+  void SetUp() {
+    InitRandomTensor<T>({batch_size_, height_, width_, input_channels_},
+                        &cpu_input_);
+    InitRandomTensor<T>(
+        {output_channels_, input_channels_, kernel_size_, kernel_size_},
+        &cpu_filter_nchw_);
+    // transpoes for filter, NCHW -> NHWC
+    TransposeNchwToNhwc<T>(cpu_filter_nchw_, &cpu_filter_nhwc_);
+    InitRandomTensor<T>({batch_size_, height_, width_, output_channels_},
+                        &cpu_output_grad_);
+  }
+
+  void BaselineForward(const platform::CUDADeviceContext &ctx,
+                       framework::Tensor *cpu_output_base,
+                       framework::Tensor *cpu_sum_base,
+                       framework::Tensor *cpu_sum_of_square_base) {
+    ComputeConv2DForward<T>(ctx, cpu_input_, cpu_filter_nchw_,
+                            cpu_output_base);
+    ComputeSumAndSquareSum<T>(*cpu_output_base, cpu_sum_base,
+                              cpu_sum_of_square_base);
+  }
+
+  void BaselineBackward(const platform::CUDADeviceContext &ctx,
+                        framework::Tensor *cpu_input_grad_base,
+                        framework::Tensor *cpu_filter_grad_base) {
+    ComputeConv2DBackward<T>(ctx, cpu_input_, cpu_filter_nchw_,
+                             cpu_output_grad_, cpu_input_grad_base,
+                             cpu_filter_grad_base, stride_, padding_,
+                             dilation_);
+  }
+
+  // get forward results of cudnn_norm_conv
+  void FusedForward(const platform::CUDADeviceContext &ctx,
+                    framework::Tensor *cpu_output, framework::Tensor *cpu_sum,
+                    framework::Tensor *cpu_sum_of_square) {
+    framework::Tensor input;
+    framework::Tensor filter_nhwc;
+    framework::Tensor output;
+    framework::Tensor sum;
+    framework::Tensor sum_of_square;
+
+    auto place = ctx.GetPlace();
+    TensorCopySync(cpu_input_, place, &input);
+    TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc);
+
+    T *input_ptr = input.data<T>();
+    T *filter_ptr = filter_nhwc.data<T>();
+    T *output_ptr = output.mutable_data<T>(
+        {batch_size_, height_, width_, output_channels_}, place);
+    float *sum_ptr =
+        sum.mutable_data<float>({1, 1, 1, output_channels_}, place);
+    float *sum_of_square_ptr =
+        sum_of_square.mutable_data<float>({1, 1, 1, output_channels_}, place);
+
+    auto input_shape = framework::vectorize<int>(input.dims());
+    auto filter_shape = framework::vectorize<int>(filter_nhwc.dims());
+    auto output_shape = framework::vectorize<int>(output.dims());
+    op::CudnnNormConvolution<T> conv_op(ctx, input_shape, filter_shape,
+                                        output_shape, padding_, stride_,
+                                        dilation_, group_);
+    conv_op.Forward(ctx, input_ptr, filter_ptr, output_ptr, sum_ptr,
+                    sum_of_square_ptr);
+
+    TensorCopySync(output, platform::CPUPlace(), cpu_output);
+    TensorCopySync(sum, platform::CPUPlace(), cpu_sum);
+    TensorCopySync(sum_of_square, platform::CPUPlace(), cpu_sum_of_square);
+  }
+
+  void FusedBackward(const platform::CUDADeviceContext &ctx,
+                     framework::Tensor *cpu_input_grad,
+                     framework::Tensor *cpu_filter_grad) {
+    framework::Tensor input;
+    framework::Tensor filter_nhwc;
+    framework::Tensor output_grad;
+    framework::Tensor input_grad;
+    framework::Tensor filter_grad;
+
+    auto place = ctx.GetPlace();
+    TensorCopySync(cpu_input_, place, &input);
+    TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc);
+    TensorCopySync(cpu_output_grad_, place, &output_grad);
+
+    T *input_ptr = input.data<T>();
+    T *filter_ptr = filter_nhwc.data<T>();
+    T *output_grad_ptr = output_grad.data<T>();
+    T *input_grad_ptr = input_grad.mutable_data<T>(input.dims(), place);
+    T *filter_grad_ptr =
+        filter_grad.mutable_data<T>(filter_nhwc.dims(), place);
+
+    auto input_shape = framework::vectorize<int>(input.dims());
+    auto filter_shape = framework::vectorize<int>(filter_nhwc.dims());
+    auto output_shape = framework::vectorize<int>(output_grad.dims());
+    op::CudnnNormConvolutionGrad<T> conv_grad_op(ctx, input_shape,
+                                                 filter_shape, output_shape,
+                                                 padding_, stride_, dilation_,
+                                                 group_);
+    conv_grad_op.Backward(ctx, input_ptr, output_grad_ptr, filter_ptr,
+                          input_grad_ptr, filter_grad_ptr);
+
+    TensorCopySync(input_grad, platform::CPUPlace(), cpu_input_grad);
+    TensorCopySync(filter_grad, platform::CPUPlace(), cpu_filter_grad);
+  }
+
+ private:
+  int batch_size_;
+  int height_;
+  int width_;
+  int input_channels_;
+  int output_channels_;
+  int kernel_size_;
+  int stride_;
+  int padding_;
+  const int dilation_ = 1;
+  const int group_ = 1;
+
+  // Forward input
+  framework::Tensor cpu_input_;
+  framework::Tensor cpu_filter_nchw_;
+  framework::Tensor cpu_filter_nhwc_;
+  // Backward input
+  framework::Tensor cpu_output_grad_;
+};

 // test for fp16, kernel = 1, output_channels = input_channels
-TEST(CudnnNormConvForward, GPUCudnnNormConvForward1Fp16) {
+TEST(CudnnNormConvFp16, K1S1) {
   int batch_size = 4;
   int height = 56;
   int width = 56;

@@ -222,15 +399,15 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward1Fp16) {
   int output_channels = 32;
   int kernel_size = 1;
   int stride = 1;
-  TestCudnnNormConvOpForward<paddle::platform::float16> test(
+  CudnnNormConvolutionTester<paddle::platform::float16> test(
       batch_size, height, width, input_channels, output_channels, kernel_size,
       stride);
-  test.Run();
-  test.CheckOut(static_cast<paddle::platform::float16>(1e-3), true);
+  test.CheckForward(1e-3, true);
+  test.CheckBackward(1e-3, true);
 }

 // test for fp16, kernel = 3, output_channels = input_channels
-TEST(CudnnNormConvForward, GPUCudnnNormConvForward2Fp16) {
+TEST(CudnnNormConvFp16, K3S1) {
   int batch_size = 4;
   int height = 56;
   int width = 56;

@@ -238,15 +415,15 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward2Fp16) {
   int output_channels = 32;
   int kernel_size = 3;
   int stride = 1;
-  TestCudnnNormConvOpForward<paddle::platform::float16> test(
+  CudnnNormConvolutionTester<paddle::platform::float16> test(
      batch_size, height, width, input_channels, output_channels, kernel_size,
      stride);
-  test.Run();
-  test.CheckOut(static_cast<paddle::platform::float16>(1e-3), true);
+  test.CheckForward(1e-3, true);
+  test.CheckBackward(1e-3, true);
 }

 // test for fp16, kernel = 1, output_channels = input_channels * 4
-TEST(CudnnNormConvForward, GPUCudnnNormConvForward3Fp16) {
+TEST(CudnnNormConvFp16, K1S1O4) {
   int batch_size = 4;
   int height = 56;
   int width = 56;

@@ -254,9 +431,9 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward3Fp16) {
   int output_channels = 128;
   int kernel_size = 1;
   int stride = 1;
-  TestCudnnNormConvOpForward<paddle::platform::float16> test(
+  CudnnNormConvolutionTester<paddle::platform::float16> test(
      batch_size, height, width, input_channels, output_channels, kernel_size,
      stride);
-  test.Run();
-  test.CheckOut(static_cast<paddle::platform::float16>(1e-3), true);
+  test.CheckForward(1e-3, true);
+  test.CheckBackward(1e-3, true);
 }
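One way to see what the fused forward op's extra outputs mean: the per-channel statistics checked above are simply the sum and sum of squares over the N*H*W positions of each output channel, which is exactly what ComputeSumAndSquareSum computes. Below is a minimal stand-alone version of that reduction with a small worked example; it uses plain float data and a helper name of our own choosing, purely for illustration.

    #include <cstdio>
    #include <vector>

    // Per-channel sum and sum-of-squares of an NHWC tensor, mirroring the
    // ComputeSumAndSquareSum helper in the test above (float instead of fp16).
    void SumAndSquareSumNHWC(const std::vector<float> &x, int n, int h, int w,
                             int c, std::vector<float> *sum,
                             std::vector<float> *sum_of_squares) {
      sum->assign(c, 0.0f);
      sum_of_squares->assign(c, 0.0f);
      for (int i = 0; i < n * h * w; ++i) {
        for (int j = 0; j < c; ++j) {
          float v = x[i * c + j];
          (*sum)[j] += v;
          (*sum_of_squares)[j] += v * v;
        }
      }
    }

    int main() {
      // 1x2x2x2 NHWC tensor: channel 0 holds {1,2,3,4}, channel 1 holds {5,6,7,8}.
      std::vector<float> x = {1, 5, 2, 6, 3, 7, 4, 8};
      std::vector<float> sum, sum_of_squares;
      SumAndSquareSumNHWC(x, 1, 2, 2, 2, &sum, &sum_of_squares);
      // Expected: sum = {10, 26}, sum_of_squares = {30, 174}.
      std::printf("sum = {%g, %g}, sqsum = {%g, %g}\n", sum[0], sum[1],
                  sum_of_squares[0], sum_of_squares[1]);
      return 0;
    }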