Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
767050d9
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
767050d9
编写于
9月 29, 2021
作者:
Y
Yiqun Liu
提交者:
GitHub
9月 29, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Implement the grad and enhance the cache of norm_convolution fusion ops. (#36168)
上级
b3d2dc7b
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
630 addition
and
253 deletion
+630
-253
paddle/fluid/framework/operator_kernel_configs.h
paddle/fluid/framework/operator_kernel_configs.h
+2
-0
paddle/fluid/operators/fused/cudnn_fusion_helper.h
paddle/fluid/operators/fused/cudnn_fusion_helper.h
+34
-31
paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
+276
-81
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
+318
-141
未找到文件。
paddle/fluid/framework/operator_kernel_configs.h
浏览文件 @
767050d9
...
@@ -15,8 +15,10 @@ limitations under the License. */
...
@@ -15,8 +15,10 @@ limitations under the License. */
#pragma once
#pragma once
#include <algorithm>
#include <algorithm>
#include <mutex>
#include <unordered_map>
#include <unordered_map>
#include <vector>
#include <vector>
#include "glog/logging.h"
namespace
paddle
{
namespace
paddle
{
namespace
framework
{
namespace
framework
{
...
...
paddle/fluid/operators/fused/cudnn_fusion_helper.h
浏览文件 @
767050d9
...
@@ -14,10 +14,8 @@ limitations under the License. */
...
@@ -14,10 +14,8 @@ limitations under the License. */
#pragma once
#pragma once
#include <string>
#include <vector>
#include <vector>
#include "paddle/fluid/platform/cudnn_desc.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
...
@@ -41,12 +39,9 @@ class CudnnFusionOp {
...
@@ -41,12 +39,9 @@ class CudnnFusionOp {
}
}
~
CudnnFusionOp
()
{
~
CudnnFusionOp
()
{
// New 'fused op' descriptor destruction
dynload
::
cudnnDestroyFusedOpsVariantParamPack
(
op_variant_params_
);
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnDestroyFusedOpsConstParamPack
(
op_const_params_
);
dynload
::
cudnnDestroyFusedOpsVariantParamPack
(
op_variant_params_
));
dynload
::
cudnnDestroyFusedOpsPlan
(
op_
);
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnDestroyFusedOpsConstParamPack
(
op_const_params_
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnDestroyFusedOpsPlan
(
op_
));
}
}
// Execute fused op
// Execute fused op
...
@@ -121,41 +116,49 @@ class CudnnFusionOp {
...
@@ -121,41 +116,49 @@ class CudnnFusionOp {
// Get the workspace, which is required before Execute().
// Get the workspace, which is required before Execute().
size_t
GetWorkspaceSizeInBytes
(
cudnnHandle_t
cudnn_handle
)
{
size_t
GetWorkspaceSizeInBytes
(
cudnnHandle_t
cudnn_handle
)
{
size_t
workspace_bytes
=
0U
;
if
(
!
plan_created_
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnMakeFusedOpsPlan
(
workspace_bytes_
=
0U
;
cudnn_handle
,
op_
,
op_const_params_
,
&
workspace_bytes
));
PADDLE_ENFORCE_CUDA_SUCCESS
(
dynload
::
cudnnMakeFusedOpsPlan
(
plan_created_
=
true
;
cudnn_handle
,
op_
,
op_const_params_
,
&
workspace_bytes_
));
return
workspace_bytes
;
plan_created_
=
true
;
}
return
workspace_bytes_
;
}
}
private:
private:
bool
plan_created_
;
bool
plan_created_
;
size_t
workspace_bytes_
;
cudnnFusedOpsPlan_t
op_
;
cudnnFusedOpsPlan_t
op_
;
cudnnFusedOpsConstParamPack_t
op_const_params_
;
cudnnFusedOpsConstParamPack_t
op_const_params_
;
cudnnFusedOpsVariantParamPack_t
op_variant_params_
;
cudnnFusedOpsVariantParamPack_t
op_variant_params_
;
};
};
static
inline
std
::
vector
<
int
>
GetStrides
(
const
std
::
vector
<
int
>
&
shape
)
{
class
CudnnFusionOpCache
{
if
(
shape
.
size
()
<
1
)
{
public:
return
{};
static
CudnnFusionOpCache
&
Instance
()
{
static
CudnnFusionOpCache
instance
;
return
instance
;
}
framework
::
AlgorithmsCache
<
CudnnFusionOp
*>
*
GetForward
()
{
return
&
forward_cache_
;
}
}
int
dim
=
static_cast
<
int
>
(
shape
.
size
());
framework
::
AlgorithmsCache
<
CudnnFusionOp
*>
*
GetBackward
()
{
std
::
vector
<
int
>
pro_shape
(
shape
);
return
&
backward_cache_
;
std
::
vector
<
int
>
strides
(
dim
);
int
temp
=
pro_shape
[
1
];
pro_shape
.
erase
(
pro_shape
.
begin
()
+
1
);
pro_shape
.
push_back
(
temp
);
strides
.
back
()
=
1
;
for
(
int
i
=
dim
-
2
;
i
>=
0
;
--
i
)
{
strides
[
i
]
=
strides
[
i
+
1
]
*
pro_shape
[
i
+
1
];
}
}
strides
.
pop_back
();
strides
.
insert
(
strides
.
begin
()
+
1
,
1
);
return
strides
;
}
static
inline
int64_t
AlignUp
(
int64_t
a
,
int64_t
b
)
{
return
(
a
+
b
-
1
)
/
b
;
}
private:
CudnnFusionOpCache
()
{}
~
CudnnFusionOpCache
()
{
// Need to delete the memory of cache.
}
CudnnFusionOpCache
(
const
CudnnFusionOpCache
&
)
{}
private:
framework
::
AlgorithmsCache
<
CudnnFusionOp
*>
forward_cache_
;
framework
::
AlgorithmsCache
<
CudnnFusionOp
*>
backward_cache_
;
};
#endif // CUDNN_VERSION >= 8000
#endif // CUDNN_VERSION >= 8000
}
// namespace operators
}
// namespace operators
...
...
paddle/fluid/operators/fused/cudnn_norm_conv.cu.h
浏览文件 @
767050d9
...
@@ -15,125 +15,320 @@ limitations under the License. */
...
@@ -15,125 +15,320 @@ limitations under the License. */
#pragma once
#pragma once
#include "paddle/fluid/operators/fused/cudnn_fusion_helper.h"
#include "paddle/fluid/operators/fused/cudnn_fusion_helper.h"
#include "paddle/fluid/platform/cudnn_desc.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
using
Tensor
=
framework
::
Tensor
;
using
Tensor
=
framework
::
Tensor
;
namespace
dynload
=
platform
::
dynload
;
namespace
dynload
=
platform
::
dynload
;
template
<
typename
T
>
using
ScalingParamType
=
typename
platform
::
CudnnDataType
<
T
>::
ScalingParamType
;
#if CUDNN_VERSION >= 8000
#if CUDNN_VERSION >= 8000
static
size_t
RoundUp
(
int64_t
a
,
int64_t
b
)
{
return
(
a
+
b
-
1
)
/
b
*
b
;
}
template
<
typename
T
>
template
<
typename
T
>
class
CudnnNormConvolutionOp
{
struct
NormConvolutionArgs
{
NormConvolutionArgs
()
{
dtype
=
platform
::
CudnnDataType
<
T
>::
type
;
format
=
CUDNN_TENSOR_NHWC
;
compute_type
=
platform
::
CudnnDataType
<
float
>::
type
;
}
void
Set
(
const
std
::
vector
<
int
>
&
input_shape
,
const
std
::
vector
<
int
>
&
filter_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
int
padding
,
int
stride
,
int
dilation
,
int
group
)
{
PADDLE_ENFORCE_EQ
(
input_shape
.
size
(),
4U
,
platform
::
errors
::
InvalidArgument
(
"The size of input_shape is expected to 4. But recieved "
"input_shape's size is %d, input_shape is [%s]."
,
input_shape
.
size
(),
framework
::
make_ddim
(
input_shape
)));
PADDLE_ENFORCE_EQ
(
filter_shape
.
size
(),
4U
,
platform
::
errors
::
InvalidArgument
(
"The size of filter_shape is expected to 4. But recieved "
"filter_shape's size is %d, filter_shape is [%s]."
,
filter_shape
.
size
(),
framework
::
make_ddim
(
filter_shape
)));
PADDLE_ENFORCE_EQ
(
filter_shape
[
1
]
==
filter_shape
[
2
]
&&
(
filter_shape
[
1
]
==
1
||
filter_shape
[
1
]
==
3
),
true
,
platform
::
errors
::
InvalidArgument
(
"The filter_shape is expected to store as nhwc, and "
"h = w = 1 or 3. But recieved filter_shape is [%s]."
,
framework
::
make_ddim
(
filter_shape
)));
PADDLE_ENFORCE_EQ
(
output_shape
.
size
(),
4U
,
platform
::
errors
::
InvalidArgument
(
"The size of output_shape is expected to 4. But recieved "
"filter_shape's size is %d, filter_shape is [%s]."
,
output_shape
.
size
(),
framework
::
make_ddim
(
output_shape
)));
for
(
size_t
i
=
0
;
i
<
input_shape
.
size
();
++
i
)
{
in_dims
.
push_back
(
input_shape
[
i
]);
}
for
(
size_t
i
=
0
;
i
<
filter_shape
.
size
();
++
i
)
{
filter_dims
.
push_back
(
filter_shape
[
i
]);
}
paddings
=
{
padding
,
padding
};
strides
=
{
stride
,
stride
};
dilations
=
{
dilation
,
dilation
};
in_desc
.
set
(
input_shape
,
format
,
dtype
);
filter_desc
.
set
(
filter_shape
,
format
,
dtype
,
group
);
out_desc
.
set
(
output_shape
,
format
,
dtype
);
int
output_channel
=
filter_shape
[
0
];
std
::
vector
<
int
>
stats_shape
=
{
1
,
1
,
1
,
output_channel
};
out_stats_desc
.
set
(
stats_shape
,
format
,
compute_type
);
conv_desc
.
set
(
dtype
,
paddings
,
strides
,
dilations
,
false
,
group
);
}
cudnnDataType_t
dtype
;
cudnnTensorFormat_t
format
;
cudnnDataType_t
compute_type
;
std
::
vector
<
int64_t
>
in_dims
;
std
::
vector
<
int64_t
>
filter_dims
;
std
::
vector
<
int
>
strides
;
std
::
vector
<
int
>
paddings
;
std
::
vector
<
int
>
dilations
;
platform
::
TensorDescriptor
in_desc
;
platform
::
FilterDescriptor
filter_desc
;
platform
::
TensorDescriptor
out_desc
;
platform
::
TensorDescriptor
out_stats_desc
;
platform
::
ConvolutionDescriptor
conv_desc
;
};
template
<
typename
T
>
class
CudnnNormConvolution
{
public:
public:
CudnnNormConvolutionOp
()
CudnnNormConvolution
(
const
platform
::
CUDADeviceContext
&
ctx
,
:
fwd_op_
(
CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS
)
{}
const
std
::
vector
<
int
>
&
input_shape
,
~
CudnnNormConvolutionOp
()
{}
const
std
::
vector
<
int
>
&
filter_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
const
int
&
padding
,
void
Init
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
int
&
stride
,
const
int
&
dilation
,
const
std
::
vector
<
int
>
&
input_shape
,
const
int
&
group
)
{
const
std
::
vector
<
int
>
&
filter_shape
,
args_
.
Set
(
input_shape
,
filter_shape
,
output_shape
,
padding
,
stride
,
const
std
::
vector
<
int
>
&
output_shape
,
const
int
&
pad
,
dilation
,
group
);
const
int
&
stride
,
const
int
&
dilate
,
const
int
&
group
)
{
cudnn_fwd_compute_type_
=
platform
::
CudnnDataType
<
float
>::
type
;
dtype_
=
platform
::
CudnnDataType
<
T
>::
type
;
format_
=
CUDNN_TENSOR_NHWC
;
InitDescriptors
(
ctx
,
input_shape
,
filter_shape
,
output_shape
,
pad
,
stride
,
dilate
,
group
);
GetWorkspaceSize
(
ctx
);
}
}
~
CudnnNormConvolution
()
{}
void
Forward
(
const
platform
::
CUDADeviceContext
&
ctx
,
T
*
input_ptr
,
void
Forward
(
const
platform
::
CUDADeviceContext
&
ctx
,
T
*
input_ptr
,
T
*
filter_ptr
,
T
*
output_ptr
,
float
*
sum_ptr
,
T
*
filter_ptr
,
T
*
output_ptr
,
float
*
sum_ptr
,
float
*
sum_of_squares_ptr
)
{
float
*
sum_of_squares_ptr
)
{
auto
handle
=
ctx
.
cudnn_handle
();
auto
cudnn_handle
=
ctx
.
cudnn_handle
();
auto
workspace_handle
=
ctx
.
cudnn_workspace_handle
();
CudnnFusionOp
*
fwd_op
=
GetForwardOp
(
ctx
);
size_t
workspace_size
=
RoundUp
(
static_cast
<
int64_t
>
(
fwd_op
->
GetWorkspaceSizeInBytes
(
cudnn_handle
)),
512
);
// Set variant_param
// Set variant_param
// input ptr
// input ptr
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_XDATA
,
input_ptr
);
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_XDATA
,
input_ptr
);
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_WDATA
,
filter_ptr
);
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_WDATA
,
filter_ptr
);
fwd_op_
.
SetOpVariantParamAttrPtr
(
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES
,
&
fwd_workspace_byte_
);
CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES
,
&
workspace_size
);
// output ptr
// output ptr
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_YDATA
,
output_ptr
);
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_YDATA
,
output_ptr
);
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_YSUM
,
sum_ptr
);
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_YSUM
,
sum_ptr
);
fwd_op_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_YSQSUM
,
sum_of_squares_ptr
);
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_YSQSUM
,
sum_of_squares_ptr
);
workspace_handle
.
RunFunc
(
ctx
.
cudnn_workspace_handle
().
RunFunc
(
[
&
](
void
*
workspace_ptr
)
{
[
&
](
void
*
workspace_ptr
)
{
// workspace ptr
// workspace ptr
fwd_op
_
.
SetOpVariantParamAttrPtr
(
CUDNN_PTR_WORKSPACE
,
workspace_ptr
);
fwd_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_WORKSPACE
,
workspace_ptr
);
// fused op execute
// fused op execute
fwd_op
_
.
Execute
(
handle
);
fwd_op
->
Execute
(
cudnn_
handle
);
},
},
fwd_workspace_byte_
);
workspace_size
);
}
}
// TBD
private:
void
Backward
(
const
platform
::
CUDADeviceContext
&
ctx
)
{}
CudnnFusionOp
*
GetForwardOp
(
const
platform
::
CUDADeviceContext
&
ctx
)
{
framework
::
AlgorithmsCache
<
CudnnFusionOp
*>
&
cache
=
*
(
CudnnFusionOpCache
::
Instance
().
GetForward
());
CudnnFusionOp
*
fwd_op
=
cache
.
GetAlgorithm
(
args_
.
in_dims
,
args_
.
filter_dims
,
args_
.
strides
,
args_
.
paddings
,
args_
.
dilations
,
0
,
static_cast
<
int64_t
>
(
args_
.
dtype
),
[
&
]()
{
CudnnFusionOp
*
fwd_op
=
new
CudnnFusionOp
(
CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS
);
// Set constant_param
fwd_op
->
SetOpConstParamAttr
(
{
CUDNN_PARAM_XDATA_PLACEHOLDER
,
CUDNN_PARAM_WDATA_PLACEHOLDER
,
CUDNN_PARAM_YDATA_PLACEHOLDER
},
CUDNN_PTR_16B_ALIGNED
);
fwd_op
->
SetOpConstParamAttr
(
{
CUDNN_PARAM_YSUM_PLACEHOLDER
,
CUDNN_PARAM_YSQSUM_PLACEHOLDER
},
CUDNN_PTR_16B_ALIGNED
);
// conv desc
fwd_op
->
SetOpConstParamDesc
(
CUDNN_PARAM_CONV_DESC
,
args_
.
conv_desc
.
desc
());
// input desc
fwd_op
->
SetOpConstParamDesc
(
CUDNN_PARAM_XDESC
,
args_
.
in_desc
.
desc
());
// filter desc
fwd_op
->
SetOpConstParamDesc
(
CUDNN_PARAM_WDESC
,
args_
.
filter_desc
.
desc
());
// output desc
fwd_op
->
SetOpConstParamDesc
(
CUDNN_PARAM_YDESC
,
args_
.
out_desc
.
desc
());
// output_stats desc
fwd_op
->
SetOpConstParamDesc
(
CUDNN_PARAM_YSTATS_DESC
,
args_
.
out_stats_desc
.
desc
());
// batch_norm mode
fwd_op
->
SetOpConstParamAttr
(
CUDNN_PARAM_BN_MODE
,
CUDNN_BATCHNORM_SPATIAL_PERSISTENT
);
// Make cudnn fused ops plan
fwd_op
->
GetWorkspaceSizeInBytes
(
ctx
.
cudnn_handle
());
return
fwd_op
;
});
return
fwd_op
;
}
private:
private:
void
InitDescriptors
(
const
platform
::
CUDADeviceContext
&
ctx
,
NormConvolutionArgs
<
T
>
args_
;
const
std
::
vector
<
int
>
&
input_shape
,
};
const
std
::
vector
<
int
>
&
filter_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
const
int
&
pad
,
const
int
&
stride
,
const
int
&
dilate
,
const
int
&
group
)
{
// Set constant_param
fwd_op_
.
SetOpConstParamAttr
(
{
CUDNN_PARAM_XDATA_PLACEHOLDER
,
CUDNN_PARAM_WDATA_PLACEHOLDER
,
CUDNN_PARAM_YDATA_PLACEHOLDER
},
CUDNN_PTR_16B_ALIGNED
);
fwd_op_
.
SetOpConstParamAttr
(
{
CUDNN_PARAM_YSUM_PLACEHOLDER
,
CUDNN_PARAM_YSQSUM_PLACEHOLDER
},
CUDNN_PTR_16B_ALIGNED
);
std
::
vector
<
int
>
pad_vec
=
{
pad
,
pad
};
std
::
vector
<
int
>
stride_vec
=
{
stride
,
stride
};
std
::
vector
<
int
>
dilate_vec
=
{
dilate
,
dilate
};
int
output_channel
=
filter_shape
[
0
];
std
::
vector
<
int
>
stats_shape
=
{
1
,
1
,
1
,
output_channel
};
// set conv desc
template
<
typename
T
>
conv_desc_
.
set
(
dtype_
,
pad_vec
,
stride_vec
,
dilate_vec
,
false
,
group
);
class
CudnnNormConvolutionGrad
{
fwd_op_
.
SetOpConstParamDesc
(
CUDNN_PARAM_CONV_DESC
,
conv_desc_
.
desc
());
public:
CudnnNormConvolutionGrad
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
std
::
vector
<
int
>
&
input_shape
,
const
std
::
vector
<
int
>
&
filter_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
const
int
&
padding
,
const
int
&
stride
,
const
int
&
dilation
,
const
int
&
group
)
{
args_
.
Set
(
input_shape
,
filter_shape
,
output_shape
,
padding
,
stride
,
dilation
,
group
);
dgrad_algo_
=
CUDNN_CONVOLUTION_BWD_DATA_ALGO_1
;
}
~
CudnnNormConvolutionGrad
()
{}
// set input desc
void
Backward
(
const
platform
::
CUDADeviceContext
&
ctx
,
T
*
input_ptr
,
in_desc_
.
set
(
input_shape
,
format_
,
dtype_
);
T
*
output_grad_ptr
,
T
*
filter_ptr
,
T
*
input_grad_ptr
,
fwd_op_
.
SetOpConstParamDesc
(
CUDNN_PARAM_XDESC
,
in_desc_
.
desc
());
T
*
filter_grad_ptr
,
bool
use_addto
=
false
)
{
if
(
filter_grad_ptr
)
{
BackwardFilter
(
ctx
,
input_ptr
,
output_grad_ptr
,
filter_ptr
,
filter_grad_ptr
);
}
if
(
input_grad_ptr
)
{
BackwardData
(
ctx
,
input_ptr
,
output_grad_ptr
,
filter_ptr
,
input_grad_ptr
,
use_addto
);
}
}
// set filter desc
private:
filter_desc_
.
set
(
filter_shape
,
format_
,
dtype_
,
group
);
void
BackwardFilter
(
const
platform
::
CUDADeviceContext
&
ctx
,
T
*
input_ptr
,
fwd_op_
.
SetOpConstParamDesc
(
CUDNN_PARAM_WDESC
,
filter_desc_
.
desc
());
T
*
output_grad_ptr
,
T
*
filter_ptr
,
T
*
filter_grad_ptr
)
{
auto
cudnn_handle
=
ctx
.
cudnn_handle
();
// set output desc
CudnnFusionOp
*
wgrad_op
=
GetBackwardFilterOp
(
ctx
);
out_desc_
.
set
(
output_shape
,
format_
,
dtype_
);
size_t
workspace_size
=
RoundUp
(
fwd_op_
.
SetOpConstParamDesc
(
CUDNN_PARAM_YDESC
,
out_desc_
.
desc
());
static_cast
<
int64_t
>
(
wgrad_op
->
GetWorkspaceSizeInBytes
(
cudnn_handle
)),
512
);
// set output_stats desc
wgrad_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_XDATA
,
input_ptr
);
out_stats_desc_
.
set
(
stats_shape
,
format_
,
cudnn_fwd_compute_type_
);
wgrad_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_DYDATA
,
output_grad_ptr
);
fwd_op_
.
SetOpConstParamDesc
(
CUDNN_PARAM_YSTATS_DESC
,
wgrad_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_DWDATA
,
filter_grad_ptr
);
out_stats_desc_
.
desc
());
wgrad_op
->
SetOpVariantParamAttrPtr
(
CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES
,
&
workspace_size
);
fwd_op_
.
SetOpConstParamAttr
(
CUDNN_PARAM_BN_MODE
,
CUDNN_BATCHNORM_SPATIAL
);
ctx
.
cudnn_workspace_handle
().
RunFunc
(
[
&
](
void
*
workspace_ptr
)
{
// workspace ptr
wgrad_op
->
SetOpVariantParamAttrPtr
(
CUDNN_PTR_WORKSPACE
,
workspace_ptr
);
// fused op execute
wgrad_op
->
Execute
(
cudnn_handle
);
},
workspace_size
);
}
}
void
GetWorkspaceSize
(
const
platform
::
CUDADeviceContext
&
ctx
)
{
void
BackwardData
(
const
platform
::
CUDADeviceContext
&
ctx
,
T
*
input_ptr
,
auto
handle
=
ctx
.
cudnn_handle
();
T
*
output_grad_ptr
,
T
*
filter_ptr
,
T
*
input_grad_ptr
,
fwd_workspace_byte_
=
fwd_op_
.
GetWorkspaceSizeInBytes
(
handle
);
bool
use_addto
=
false
)
{
auto
cudnn_handle
=
ctx
.
cudnn_handle
();
size_t
workspace_size
=
GetWorkspaceSizeBwdData
(
ctx
);
// Convolution dgrad followed optionally by batchnorm dgrad
ScalingParamType
<
T
>
alpha
=
1.0
f
;
ScalingParamType
<
T
>
beta
=
use_addto
?
1.0
f
:
0.0
f
;
ctx
.
cudnn_workspace_handle
().
RunFunc
(
[
&
](
void
*
cudnn_workspace_ptr
)
{
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnConvolutionBackwardData
(
cudnn_handle
,
&
alpha
,
args_
.
filter_desc
.
desc
(),
filter_ptr
,
args_
.
out_desc
.
desc
(),
output_grad_ptr
,
args_
.
conv_desc
.
desc
(),
dgrad_algo_
,
cudnn_workspace_ptr
,
workspace_size
,
&
beta
,
args_
.
in_desc
.
desc
(),
input_grad_ptr
));
},
workspace_size
);
}
}
size_t
fwd_workspace_byte_
=
0
;
CudnnFusionOp
*
GetBackwardFilterOp
(
const
platform
::
CUDADeviceContext
&
ctx
)
{
framework
::
AlgorithmsCache
<
CudnnFusionOp
*>
&
cache
=
*
(
CudnnFusionOpCache
::
Instance
().
GetBackward
());
CudnnFusionOp
*
wgrad_op
=
cache
.
GetAlgorithm
(
args_
.
in_dims
,
args_
.
filter_dims
,
args_
.
strides
,
args_
.
paddings
,
args_
.
dilations
,
0
,
static_cast
<
int64_t
>
(
args_
.
dtype
),
[
&
]()
{
CudnnFusionOp
*
wgrad_op
=
new
CudnnFusionOp
(
CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD
);
wgrad_op
->
SetOpConstParamAttr
(
{
CUDNN_PARAM_DYDATA_PLACEHOLDER
,
CUDNN_PARAM_XDATA_PLACEHOLDER
,
CUDNN_PARAM_DWDATA_PLACEHOLDER
},
CUDNN_PTR_16B_ALIGNED
);
cudnnDataType_t
dtype_
;
// conv desc
cudnnDataType_t
cudnn_fwd_compute_type_
;
wgrad_op
->
SetOpConstParamDesc
(
CUDNN_PARAM_CONV_DESC
,
platform
::
TensorDescriptor
in_desc_
;
args_
.
conv_desc
.
desc
());
platform
::
FilterDescriptor
filter_desc_
;
// input desc
platform
::
TensorDescriptor
out_desc_
;
wgrad_op
->
SetOpConstParamDesc
(
CUDNN_PARAM_XDESC
,
platform
::
TensorDescriptor
out_stats_desc_
;
args_
.
in_desc
.
desc
());
platform
::
ConvolutionDescriptor
conv_desc_
;
// filter desc
cudnnTensorFormat_t
format_
;
wgrad_op
->
SetOpConstParamDesc
(
CUDNN_PARAM_DWDESC
,
args_
.
filter_desc
.
desc
());
// output desc
wgrad_op
->
SetOpConstParamDesc
(
CUDNN_PARAM_DYDESC
,
args_
.
out_desc
.
desc
());
wgrad_op
->
SetOpConstParamAttr
(
CUDNN_PARAM_BN_MODE
,
CUDNN_BATCHNORM_SPATIAL_PERSISTENT
);
CudnnFusionOp
fwd_op_
;
// Make cudnn fused ops plan
wgrad_op
->
GetWorkspaceSizeInBytes
(
ctx
.
cudnn_handle
());
return
wgrad_op
;
});
return
wgrad_op
;
}
size_t
GetWorkspaceSizeBwdData
(
const
platform
::
CUDADeviceContext
&
ctx
)
{
size_t
workspace_size
=
0U
;
auto
handle
=
ctx
.
cudnn_handle
();
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
cudnnGetConvolutionBackwardDataWorkspaceSize
(
handle
,
args_
.
filter_desc
.
desc
(),
args_
.
out_desc
.
desc
(),
args_
.
conv_desc
.
desc
(),
args_
.
in_desc
.
desc
(),
dgrad_algo_
,
&
workspace_size
));
return
RoundUp
(
workspace_size
,
512
);
}
private:
NormConvolutionArgs
<
T
>
args_
;
cudnnConvolutionBwdDataAlgo_t
dgrad_algo_
;
};
};
#endif
#endif
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
浏览文件 @
767050d9
...
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
...
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include <random>
#include <random>
#include <vector>
#include <vector>
...
@@ -29,23 +30,80 @@ namespace op = paddle::operators;
...
@@ -29,23 +30,80 @@ namespace op = paddle::operators;
using
Tensor
=
paddle
::
framework
::
Tensor
;
using
Tensor
=
paddle
::
framework
::
Tensor
;
USE_OP
(
conv2d
);
USE_OP
(
conv2d
);
USE_OP
(
conv2d_grad
);
USE_OP_DEVICE_KERNEL
(
conv2d
,
CUDNN
);
USE_OP_DEVICE_KERNEL
(
conv2d
,
CUDNN
);
USE_OP_DEVICE_KERNEL
(
conv2d_grad
,
CUDNN
);
template
<
typename
T
>
void
InitRandomTensor
(
const
std
::
vector
<
int64_t
>
&
dims
,
framework
::
Tensor
*
cpu_out
)
{
T
*
cpu_out_ptr
=
cpu_out
->
mutable_data
<
T
>
(
framework
::
make_ddim
(
dims
),
platform
::
CPUPlace
());
std
::
default_random_engine
random
(
0
);
std
::
uniform_real_distribution
<
float
>
dis
(
0.0
,
1.0
);
for
(
int
i
=
0
;
i
<
cpu_out
->
numel
();
++
i
)
{
cpu_out_ptr
[
i
]
=
static_cast
<
T
>
(
dis
(
random
));
}
}
// get paddle conv2d op results as baseline
template
<
typename
T
>
template
<
typename
T
>
void
Conv2DForwardCompute
(
const
Tensor
&
x
,
const
Tensor
&
w
,
Tensor
*
y
,
void
TransposeNchwToNhwc
(
const
framework
::
Tensor
&
cpu_in
,
const
platform
::
CUDADeviceContext
&
ctx
)
{
framework
::
Tensor
*
cpu_out
)
{
auto
in_dims
=
cpu_in
.
dims
();
EXPECT_EQ
(
cpu_in
.
dims
().
size
(),
4
);
const
T
*
cpu_in_ptr
=
cpu_in
.
data
<
T
>
();
T
*
cpu_out_ptr
=
cpu_out
->
mutable_data
<
T
>
(
{
in_dims
[
0
],
in_dims
[
2
],
in_dims
[
3
],
in_dims
[
1
]},
platform
::
CPUPlace
());
int64_t
n
=
in_dims
[
0
];
int64_t
c
=
in_dims
[
1
];
int64_t
hw
=
in_dims
[
2
]
*
in_dims
[
3
];
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
for
(
int
j
=
0
;
j
<
hw
;
++
j
)
{
for
(
int
k
=
0
;
k
<
c
;
++
k
)
{
int
dst_idx
=
i
*
hw
*
c
+
j
*
c
+
k
;
int
src_idx
=
i
*
c
*
hw
+
k
*
hw
+
j
;
cpu_out_ptr
[
dst_idx
]
=
cpu_in_ptr
[
src_idx
];
}
}
}
}
template
<
typename
T
>
void
CheckOutput
(
const
framework
::
Tensor
&
cpu_res
,
const
framework
::
Tensor
&
cpu_base
,
float
diff
,
bool
is_relative_atol
=
false
)
{
EXPECT_EQ
(
cpu_res
.
dims
(),
cpu_base
.
dims
());
const
T
*
cpu_res_ptr
=
cpu_res
.
data
<
T
>
();
const
T
*
cpu_base_ptr
=
cpu_base
.
data
<
T
>
();
for
(
int
i
=
0
;
i
<
cpu_res
.
numel
();
++
i
)
{
if
(
is_relative_atol
)
{
EXPECT_LT
(
static_cast
<
float
>
(
std
::
abs
((
cpu_res_ptr
[
i
]
-
cpu_base_ptr
[
i
])
/
cpu_base_ptr
[
i
])),
diff
);
}
else
{
EXPECT_LT
(
static_cast
<
float
>
(
std
::
abs
(
cpu_res_ptr
[
i
]
-
cpu_base_ptr
[
i
])),
diff
);
}
}
}
// Use Paddle conv2d op results as baseline
template
<
typename
T
>
void
ComputeConv2DForward
(
const
platform
::
CUDADeviceContext
&
ctx
,
const
Tensor
&
cpu_input
,
const
Tensor
&
cpu_filter
,
Tensor
*
cpu_output
)
{
framework
::
Scope
scope
;
framework
::
Scope
scope
;
auto
var_x
=
scope
.
Var
(
"Input"
);
auto
*
input
=
scope
.
Var
(
"Input"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
tensor_x
=
var_x
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
filter
=
scope
.
Var
(
"Filter"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
var_w
=
scope
.
Var
(
"Filter"
);
auto
*
output
=
scope
.
Var
(
"Output"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
tensor_w
=
var_w
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
var_y
=
scope
.
Var
(
"Output"
);
auto
tensor_y
=
var_y
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
place
=
ctx
.
GetPlace
();
auto
place
=
ctx
.
GetPlace
();
TensorCopySync
(
x
,
place
,
tensor_x
);
TensorCopySync
(
cpu_input
,
place
,
input
);
TensorCopySync
(
w
,
place
,
tensor_w
);
TensorCopySync
(
cpu_filter
,
place
,
filter
);
framework
::
AttributeMap
attrs
;
framework
::
AttributeMap
attrs
;
bool
use_cudnn
=
true
;
bool
use_cudnn
=
true
;
...
@@ -60,25 +118,94 @@ void Conv2DForwardCompute(const Tensor &x, const Tensor &w, Tensor *y,
...
@@ -60,25 +118,94 @@ void Conv2DForwardCompute(const Tensor &x, const Tensor &w, Tensor *y,
{{
"Output"
,
{
"Output"
}}},
attrs
);
{{
"Output"
,
{
"Output"
}}},
attrs
);
op
->
Run
(
scope
,
ctx
.
GetPlace
());
op
->
Run
(
scope
,
ctx
.
GetPlace
());
TensorCopySync
(
*
tensor_y
,
place
,
y
);
TensorCopySync
(
*
output
,
platform
::
CPUPlace
(),
cpu_output
);
ctx
.
Wait
();
}
}
// Use Paddle conv2d_grad op results as baseline
template
<
typename
T
>
template
<
typename
T
>
class
TestCudnnNormConvOpForward
{
void
ComputeConv2DBackward
(
const
platform
::
CUDADeviceContext
&
ctx
,
public:
const
Tensor
&
cpu_input
,
const
Tensor
&
cpu_filter
,
TestCudnnNormConvOpForward
()
{
const
Tensor
&
cpu_output_grad
,
batch_size_
=
2
;
framework
::
Tensor
*
cpu_input_grad
,
height_
=
8
;
framework
::
Tensor
*
cpu_filter_grad
,
int
stride
,
width_
=
8
;
int
padding
,
int
dilation
)
{
input_channels_
=
8
;
framework
::
Scope
scope
;
output_channels_
=
32
;
auto
*
input
=
scope
.
Var
(
"Input"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
kernel_size_
=
1
;
auto
*
filter
=
scope
.
Var
(
"Filter"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
stride_
=
1
;
auto
*
output_grad
=
pad_
=
0
;
scope
.
Var
(
"Output@GRAD"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
input_grad
=
scope
.
Var
(
"Input@GRAD"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
*
filter_grad
=
scope
.
Var
(
"Filter@GRAD"
)
->
GetMutable
<
framework
::
LoDTensor
>
();
auto
place
=
ctx
.
GetPlace
();
TensorCopySync
(
cpu_input
,
place
,
input
);
TensorCopySync
(
cpu_filter
,
place
,
filter
);
TensorCopySync
(
cpu_output_grad
,
place
,
output_grad
);
framework
::
AttributeMap
attrs
;
bool
use_cudnn
=
true
;
std
::
string
data_format
=
"NHWC"
;
std
::
string
padding_algorithm
=
"SAME"
;
std
::
vector
<
int
>
strides
=
{
stride
,
stride
};
std
::
vector
<
int
>
paddings
=
{
padding
,
padding
};
std
::
vector
<
int
>
dilations
=
{
dilation
,
dilation
};
int
groups
=
1
;
bool
exhaustive_search
=
false
;
bool
use_addto
=
false
;
attrs
.
insert
({
"use_cudnn"
,
use_cudnn
});
attrs
.
insert
({
"data_format"
,
data_format
});
attrs
.
insert
({
"padding_algorithm"
,
padding_algorithm
});
attrs
.
insert
({
"strides"
,
strides
});
attrs
.
insert
({
"paddings"
,
paddings
});
attrs
.
insert
({
"dilations"
,
dilations
});
attrs
.
insert
({
"groups"
,
groups
});
attrs
.
insert
({
"exhaustive_search"
,
exhaustive_search
});
attrs
.
insert
({
"use_addto"
,
use_addto
});
auto
op
=
framework
::
OpRegistry
::
CreateOp
(
"conv2d_grad"
,
{{
"Input"
,
{
"Input"
}},
{
"Filter"
,
{
"Filter"
}},
{
"Output@GRAD"
,
{
"Output@GRAD"
}}},
{{
"Input@GRAD"
,
{
"Input@GRAD"
}},
{
"Filter@GRAD"
,
{
"Filter@GRAD"
}}},
attrs
);
op
->
Run
(
scope
,
ctx
.
GetPlace
());
TensorCopySync
(
*
input_grad
,
platform
::
CPUPlace
(),
cpu_input_grad
);
TensorCopySync
(
*
filter_grad
,
platform
::
CPUPlace
(),
cpu_filter_grad
);
}
template
<
typename
T
>
void
ComputeSumAndSquareSum
(
const
framework
::
Tensor
&
cpu_out
,
framework
::
Tensor
*
cpu_sum
,
framework
::
Tensor
*
cpu_sum_of_square
)
{
auto
dims
=
cpu_out
.
dims
();
int64_t
c
=
dims
[
3
];
const
T
*
cpu_out_ptr
=
cpu_out
.
data
<
T
>
();
float
*
cpu_sum_ptr
=
cpu_sum
->
mutable_data
<
float
>
({
1
,
1
,
1
,
c
},
platform
::
CPUPlace
());
float
*
cpu_sum_square_ptr
=
cpu_sum_of_square
->
mutable_data
<
float
>
(
{
1
,
1
,
1
,
c
},
platform
::
CPUPlace
());
for
(
int
j
=
0
;
j
<
c
;
++
j
)
{
float
tmp_sum
=
0.0
f
;
float
tmp_sum_of_squares
=
0.0
f
;
for
(
int
i
=
0
;
i
<
cpu_out
.
numel
()
/
c
;
++
i
)
{
float
tmp_out
=
static_cast
<
float
>
(
cpu_out_ptr
[
i
*
c
+
j
]);
tmp_sum
+=
tmp_out
;
tmp_sum_of_squares
+=
tmp_out
*
tmp_out
;
}
cpu_sum_ptr
[
j
]
=
tmp_sum
;
cpu_sum_square_ptr
[
j
]
=
tmp_sum_of_squares
;
}
}
}
TestCudnnNormConvOpForward
(
int
batch_size
,
int
height
,
int
width
,
template
<
typename
T
>
class
CudnnNormConvolutionTester
{
public:
CudnnNormConvolutionTester
(
int
batch_size
,
int
height
,
int
width
,
int
input_channels
,
int
output_channels
,
int
input_channels
,
int
output_channels
,
int
kernel_size
,
int
stride
)
{
int
kernel_size
,
int
stride
)
{
batch_size_
=
batch_size
;
batch_size_
=
batch_size
;
...
@@ -88,133 +215,183 @@ class TestCudnnNormConvOpForward {
...
@@ -88,133 +215,183 @@ class TestCudnnNormConvOpForward {
output_channels_
=
output_channels
;
output_channels_
=
output_channels
;
kernel_size_
=
kernel_size
;
kernel_size_
=
kernel_size
;
stride_
=
stride
;
stride_
=
stride
;
pad_
=
(
kernel_size_
-
1
)
/
2
;
padding_
=
(
kernel_size_
-
1
)
/
2
;
SetUp
();
}
}
~
TestCudnnNormConvOpForward
()
{}
~
CudnnNormConvolutionTester
()
{}
void
CheckForward
(
float
diff
,
bool
is_relative_atol
=
false
)
{
platform
::
CUDADeviceContext
*
ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
platform
::
CUDAPlace
(
0
)));
framework
::
Tensor
cpu_output_base
;
framework
::
Tensor
cpu_sum_base
;
framework
::
Tensor
cpu_sum_of_square_base
;
BaselineForward
(
*
ctx
,
&
cpu_output_base
,
&
cpu_sum_base
,
&
cpu_sum_of_square_base
);
framework
::
Tensor
cpu_output
;
framework
::
Tensor
cpu_sum
;
framework
::
Tensor
cpu_sum_of_square
;
FusedForward
(
*
ctx
,
&
cpu_output
,
&
cpu_sum
,
&
cpu_sum_of_square
);
// Check forward correctness between baseline and results of normconv.
CheckOutput
<
T
>
(
cpu_output
,
cpu_output_base
,
diff
,
is_relative_atol
);
CheckOutput
<
float
>
(
cpu_sum
,
cpu_sum_base
,
diff
,
is_relative_atol
);
CheckOutput
<
float
>
(
cpu_sum_of_square
,
cpu_sum_of_square_base
,
diff
,
is_relative_atol
);
}
void
CheckBackward
(
float
diff
,
bool
is_relative_atol
=
false
)
{
platform
::
CUDADeviceContext
*
ctx
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
platform
::
CUDAPlace
(
0
)));
framework
::
Tensor
cpu_input_grad_base
;
framework
::
Tensor
cpu_filter_nchw_grad_base
;
framework
::
Tensor
cpu_filter_nhwc_grad_base
;
BaselineBackward
(
*
ctx
,
&
cpu_input_grad_base
,
&
cpu_filter_nchw_grad_base
);
TransposeNchwToNhwc
<
T
>
(
cpu_filter_nchw_grad_base
,
&
cpu_filter_nhwc_grad_base
);
framework
::
Tensor
cpu_input_grad
;
framework
::
Tensor
cpu_filter_nhwc_grad
;
FusedBackward
(
*
ctx
,
&
cpu_input_grad
,
&
cpu_filter_nhwc_grad
);
// Check backward correctness between baseline and results of normconv.
CheckOutput
<
T
>
(
cpu_input_grad
,
cpu_input_grad_base
,
diff
,
is_relative_atol
);
CheckOutput
<
T
>
(
cpu_filter_nhwc_grad
,
cpu_filter_nhwc_grad_base
,
diff
,
is_relative_atol
);
}
private:
void
SetUp
()
{
void
SetUp
()
{
input_size_
=
batch_size_
*
height_
*
width_
*
input_channels_
;
InitRandomTensor
<
T
>
({
batch_size_
,
height_
,
width_
,
input_channels_
},
filter_size_
=
&
cpu_input_
);
output_channels_
*
input_channels_
*
kernel_size_
*
kernel_size_
;
InitRandomTensor
<
T
>
(
output_size_
=
batch_size_
*
height_
*
width_
*
output_channels_
;
{
output_channels_
,
input_channels_
,
kernel_size_
,
kernel_size_
},
param_size_
=
output_channels_
;
&
cpu_filter_nchw_
);
// transpoes for filter, NCHW -> NHWC
input_vec_
.
resize
(
input_size_
);
TransposeNchwToNhwc
<
T
>
(
cpu_filter_nchw_
,
&
cpu_filter_nhwc_
);
filter_raw_vec_
.
resize
(
filter_size_
);
InitRandomTensor
<
T
>
({
batch_size_
,
height_
,
width_
,
output_channels_
},
filter_pro_vec_
.
resize
(
filter_size_
);
&
cpu_output_grad_
);
}
std
::
default_random_engine
random
(
0
);
std
::
uniform_real_distribution
<
float
>
dis
(
0.0
,
1.0
);
for
(
int
i
=
0
;
i
<
input_size_
;
++
i
)
{
input_vec_
[
i
]
=
static_cast
<
T
>
(
dis
(
random
));
}
for
(
int
i
=
0
;
i
<
filter_size_
;
++
i
)
{
filter_raw_vec_
[
i
]
=
static_cast
<
T
>
(
dis
(
random
));
}
// transpoes for filter
// NCHW->NHWC
for
(
int
oc
=
0
;
oc
<
output_channels_
;
++
oc
)
{
for
(
int
kh
=
0
;
kh
<
kernel_size_
;
++
kh
)
{
for
(
int
kw
=
0
;
kw
<
kernel_size_
;
++
kw
)
{
for
(
int
ic
=
0
;
ic
<
input_channels_
;
++
ic
)
{
int
dst_idx
=
oc
*
kernel_size_
*
kernel_size_
*
input_channels_
+
kh
*
kernel_size_
*
input_channels_
+
kw
*
input_channels_
+
ic
;
int
src_idx
=
oc
*
kernel_size_
*
kernel_size_
*
input_channels_
+
ic
*
kernel_size_
*
kernel_size_
+
kh
*
kernel_size_
+
kw
;
filter_pro_vec_
[
dst_idx
]
=
filter_raw_vec_
[
src_idx
];
}
}
}
}
framework
::
TensorFromVector
<
T
>
(
input_vec_
,
*
ctx_
,
&
input_
);
void
BaselineForward
(
const
platform
::
CUDADeviceContext
&
ctx
,
input_
.
Resize
({
batch_size_
,
height_
,
width_
,
input_channels_
});
framework
::
Tensor
*
cpu_output_base
,
framework
::
TensorFromVector
<
T
>
(
filter_raw_vec_
,
*
ctx_
,
&
filter_raw_
);
framework
::
Tensor
*
cpu_sum_base
,
filter_raw_
.
Resize
(
framework
::
Tensor
*
cpu_sum_of_square_base
)
{
{
output_channels_
,
input_channels_
,
kernel_size_
,
kernel_size_
});
ComputeConv2DForward
<
T
>
(
ctx
,
cpu_input_
,
cpu_filter_nchw_
,
cpu_output_base
);
framework
::
TensorFromVector
<
T
>
(
filter_pro_vec_
,
*
ctx_
,
&
filter_pro_
);
ComputeSumAndSquareSum
<
T
>
(
*
cpu_output_base
,
cpu_sum_base
,
filter_pro_
.
Resize
(
cpu_sum_of_square_base
);
{
output_channels_
,
kernel_size_
,
kernel_size_
,
input_channels_
});
output_
.
Resize
({
batch_size_
,
height_
,
width_
,
output_channels_
});
base_output_
.
Resize
({
batch_size_
,
height_
,
width_
,
output_channels_
});
sum_
.
Resize
({
1
,
1
,
1
,
output_channels_
});
sum_of_squares_
.
Resize
({
1
,
1
,
1
,
output_channels_
});
ctx_
->
Wait
();
}
}
void
BaselineForward
()
{
void
BaselineBackward
(
const
platform
::
CUDADeviceContext
&
ctx
,
Conv2DForwardCompute
<
T
>
(
input_
,
filter_raw_
,
&
base_output_
,
*
ctx_
);
framework
::
Tensor
*
cpu_input_grad_base
,
ctx_
->
Wait
();
framework
::
Tensor
*
cpu_filter_grad_base
)
{
ComputeConv2DBackward
<
T
>
(
ctx
,
cpu_input_
,
cpu_filter_nchw_
,
cpu_output_grad_
,
cpu_input_grad_base
,
cpu_filter_grad_base
,
stride_
,
padding_
,
dilation_
);
}
}
// get forward results of cudnn_norm_conv
// get forward results of cudnn_norm_conv
void
FusedForward
()
{
void
FusedForward
(
const
platform
::
CUDADeviceContext
&
ctx
,
auto
input_shape
=
framework
::
vectorize
<
int
>
(
input_
.
dims
());
framework
::
Tensor
*
cpu_output
,
framework
::
Tensor
*
cpu_sum
,
auto
filter_shape
=
framework
::
vectorize
<
int
>
(
filter_pro_
.
dims
());
framework
::
Tensor
*
cpu_sum_of_square
)
{
auto
output_shape
=
framework
::
vectorize
<
int
>
(
output_
.
dims
());
framework
::
Tensor
input
;
T
*
input_ptr
=
input_
.
data
<
T
>
();
framework
::
Tensor
filter_nhwc
;
T
*
filter_ptr
=
filter_pro_
.
data
<
T
>
();
framework
::
Tensor
output
;
T
*
output_ptr
=
output_
.
mutable_data
<
T
>
(
place_
);
framework
::
Tensor
sum
;
float
*
sum_ptr
=
sum_
.
mutable_data
<
float
>
(
place_
);
framework
::
Tensor
sum_of_square
;
float
*
sum_of_squares_ptr
=
sum_of_squares_
.
mutable_data
<
float
>
(
place_
);
std
::
shared_ptr
<
op
::
CudnnNormConvolutionOp
<
T
>>
conv_op
(
new
op
::
CudnnNormConvolutionOp
<
T
>
());
conv_op
->
Init
(
*
ctx_
,
input_shape
,
filter_shape
,
output_shape
,
pad_
,
stride_
,
dilate_
,
group_
);
conv_op
->
Forward
(
*
ctx_
,
input_ptr
,
filter_ptr
,
output_ptr
,
sum_ptr
,
sum_of_squares_ptr
);
ctx_
->
Wait
();
}
void
Run
()
{
auto
place
=
ctx
.
GetPlace
();
SetUp
();
TensorCopySync
(
cpu_input_
,
place
,
&
input
);
BaselineForward
();
TensorCopySync
(
cpu_filter_nhwc_
,
place
,
&
filter_nhwc
);
FusedForward
();
T
*
input_ptr
=
input
.
data
<
T
>
();
T
*
filter_ptr
=
filter_nhwc
.
data
<
T
>
();
T
*
output_ptr
=
output
.
mutable_data
<
T
>
(
{
batch_size_
,
height_
,
width_
,
output_channels_
},
place
);
float
*
sum_ptr
=
sum
.
mutable_data
<
float
>
({
1
,
1
,
1
,
output_channels_
},
place
);
float
*
sum_of_square_ptr
=
sum_of_square
.
mutable_data
<
float
>
({
1
,
1
,
1
,
output_channels_
},
place
);
auto
input_shape
=
framework
::
vectorize
<
int
>
(
input
.
dims
());
auto
filter_shape
=
framework
::
vectorize
<
int
>
(
filter_nhwc
.
dims
());
auto
output_shape
=
framework
::
vectorize
<
int
>
(
output
.
dims
());
op
::
CudnnNormConvolution
<
T
>
conv_op
(
ctx
,
input_shape
,
filter_shape
,
output_shape
,
padding_
,
stride_
,
dilation_
,
group_
);
conv_op
.
Forward
(
ctx
,
input_ptr
,
filter_ptr
,
output_ptr
,
sum_ptr
,
sum_of_square_ptr
);
TensorCopySync
(
output
,
platform
::
CPUPlace
(),
cpu_output
);
TensorCopySync
(
sum
,
platform
::
CPUPlace
(),
cpu_sum
);
TensorCopySync
(
sum_of_square
,
platform
::
CPUPlace
(),
cpu_sum_of_square
);
}
}
// check forward correctness between baseline and results of normconv.
void
FusedBackward
(
const
platform
::
CUDADeviceContext
&
ctx
,
void
CheckOut
(
const
T
diff
,
bool
is_relative_atol
=
false
)
{
framework
::
Tensor
*
cpu_input_grad
,
std
::
vector
<
T
>
base_output_vec
,
output_vec
;
framework
::
Tensor
*
cpu_filter_grad
)
{
output_vec
.
resize
(
output_size_
);
framework
::
Tensor
input
;
base_output_vec
.
resize
(
output_size_
);
framework
::
Tensor
filter_nhwc
;
TensorToVector
(
base_output_
,
*
ctx_
,
&
base_output_vec
);
framework
::
Tensor
output_grad
;
TensorToVector
(
output_
,
*
ctx_
,
&
output_vec
);
framework
::
Tensor
input_grad
;
ctx_
->
Wait
();
framework
::
Tensor
filter_grad
;
for
(
int
i
=
0
;
i
<
output_size_
;
++
i
)
{
auto
place
=
ctx
.
GetPlace
();
if
(
is_relative_atol
)
{
TensorCopySync
(
cpu_input_
,
place
,
&
input
);
EXPECT_LT
(
TensorCopySync
(
cpu_filter_nhwc_
,
place
,
&
filter_nhwc
);
std
::
abs
((
output_vec
[
i
]
-
base_output_vec
[
i
])
/
base_output_vec
[
i
]),
TensorCopySync
(
cpu_output_grad_
,
place
,
&
output_grad
);
diff
);
}
else
{
T
*
input_ptr
=
input
.
data
<
T
>
();
EXPECT_LT
(
std
::
abs
(
output_vec
[
i
]
-
base_output_vec
[
i
]),
diff
);
T
*
filter_ptr
=
filter_nhwc
.
data
<
T
>
();
}
T
*
output_grad_ptr
=
output_grad
.
data
<
T
>
();
}
T
*
input_grad_ptr
=
input_grad
.
mutable_data
<
T
>
(
input
.
dims
(),
place
);
T
*
filter_grad_ptr
=
filter_grad
.
mutable_data
<
T
>
(
filter_nhwc
.
dims
(),
place
);
auto
input_shape
=
framework
::
vectorize
<
int
>
(
input
.
dims
());
auto
filter_shape
=
framework
::
vectorize
<
int
>
(
filter_nhwc
.
dims
());
auto
output_shape
=
framework
::
vectorize
<
int
>
(
output_grad
.
dims
());
op
::
CudnnNormConvolutionGrad
<
T
>
conv_grad_op
(
ctx
,
input_shape
,
filter_shape
,
output_shape
,
padding_
,
stride_
,
dilation_
,
group_
);
conv_grad_op
.
Backward
(
ctx
,
input_ptr
,
output_grad_ptr
,
filter_ptr
,
input_grad_ptr
,
filter_grad_ptr
);
TensorCopySync
(
input_grad
,
platform
::
CPUPlace
(),
cpu_input_grad
);
TensorCopySync
(
filter_grad
,
platform
::
CPUPlace
(),
cpu_filter_grad
);
}
}
private:
private:
int
batch_size_
,
height_
,
width_
,
input_channels_
,
output_channels_
;
int
batch_size_
;
int
kernel_size_
,
stride_
,
pad_
;
int
height_
;
const
int
dilate_
=
1
;
int
width_
;
int
input_channels_
;
int
output_channels_
;
int
kernel_size_
;
int
stride_
;
int
padding_
;
const
int
dilation_
=
1
;
const
int
group_
=
1
;
const
int
group_
=
1
;
int
input_size_
,
filter_size_
,
output_size_
,
param_size_
;
framework
::
Tensor
input_
,
filter_raw_
,
filter_pro_
,
output_
,
base_output_
;
// Forward input
framework
::
Tensor
sum_
,
sum_of_squares_
;
framework
::
Tensor
cpu_input_
;
std
::
vector
<
T
>
input_vec_
,
filter_raw_vec_
,
filter_pro_vec_
;
framework
::
Tensor
cpu_filter_nchw_
;
framework
::
Tensor
cpu_filter_nhwc_
;
platform
::
CUDAPlace
place_
=
platform
::
CUDAPlace
(
0
);
// Backward input
platform
::
CUDADeviceContext
*
ctx_
=
framework
::
Tensor
cpu_output_grad_
;
static_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place_
));
};
};
// test for fp16, kernel = 1, output_channels = input_channels
// test for fp16, kernel = 1, output_channels = input_channels
TEST
(
CudnnNormConvF
orward
,
GPUCudnnNormConvForward1Fp16
)
{
TEST
(
CudnnNormConvF
p16
,
K1S1
)
{
int
batch_size
=
4
;
int
batch_size
=
4
;
int
height
=
56
;
int
height
=
56
;
int
width
=
56
;
int
width
=
56
;
...
@@ -222,15 +399,15 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward1Fp16) {
...
@@ -222,15 +399,15 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward1Fp16) {
int
output_channels
=
32
;
int
output_channels
=
32
;
int
kernel_size
=
1
;
int
kernel_size
=
1
;
int
stride
=
1
;
int
stride
=
1
;
TestCudnnNormConvOpForward
<
paddle
::
platform
::
float16
>
test
(
CudnnNormConvolutionTester
<
paddle
::
platform
::
float16
>
test
(
batch_size
,
height
,
width
,
input_channels
,
output_channels
,
kernel_size
,
batch_size
,
height
,
width
,
input_channels
,
output_channels
,
kernel_size
,
stride
);
stride
);
test
.
Run
(
);
test
.
CheckForward
(
1e-3
,
true
);
test
.
Check
Out
(
static_cast
<
paddle
::
platform
::
float16
>
(
1e-3
)
,
true
);
test
.
Check
Backward
(
1e-3
,
true
);
}
}
// test for fp16, kernel = 3, output_channels = input_channels
// test for fp16, kernel = 3, output_channels = input_channels
TEST
(
CudnnNormConvF
orward
,
GPUCudnnNormConvForward2Fp16
)
{
TEST
(
CudnnNormConvF
p16
,
K3S1
)
{
int
batch_size
=
4
;
int
batch_size
=
4
;
int
height
=
56
;
int
height
=
56
;
int
width
=
56
;
int
width
=
56
;
...
@@ -238,15 +415,15 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward2Fp16) {
...
@@ -238,15 +415,15 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward2Fp16) {
int
output_channels
=
32
;
int
output_channels
=
32
;
int
kernel_size
=
3
;
int
kernel_size
=
3
;
int
stride
=
1
;
int
stride
=
1
;
TestCudnnNormConvOpForward
<
paddle
::
platform
::
float16
>
test
(
CudnnNormConvolutionTester
<
paddle
::
platform
::
float16
>
test
(
batch_size
,
height
,
width
,
input_channels
,
output_channels
,
kernel_size
,
batch_size
,
height
,
width
,
input_channels
,
output_channels
,
kernel_size
,
stride
);
stride
);
test
.
Run
(
);
test
.
CheckForward
(
1e-3
,
true
);
test
.
Check
Out
(
static_cast
<
paddle
::
platform
::
float16
>
(
1e-3
)
,
true
);
test
.
Check
Backward
(
1e-3
,
true
);
}
}
// test for fp16, kernel = 1, output_channels = input_channels * 4
// test for fp16, kernel = 1, output_channels = input_channels * 4
TEST
(
CudnnNormConvF
orward
,
GPUCudnnNormConvForward3Fp16
)
{
TEST
(
CudnnNormConvF
p16
,
K1S1O4
)
{
int
batch_size
=
4
;
int
batch_size
=
4
;
int
height
=
56
;
int
height
=
56
;
int
width
=
56
;
int
width
=
56
;
...
@@ -254,9 +431,9 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward3Fp16) {
...
@@ -254,9 +431,9 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward3Fp16) {
int
output_channels
=
128
;
int
output_channels
=
128
;
int
kernel_size
=
1
;
int
kernel_size
=
1
;
int
stride
=
1
;
int
stride
=
1
;
TestCudnnNormConvOpForward
<
paddle
::
platform
::
float16
>
test
(
CudnnNormConvolutionTester
<
paddle
::
platform
::
float16
>
test
(
batch_size
,
height
,
width
,
input_channels
,
output_channels
,
kernel_size
,
batch_size
,
height
,
width
,
input_channels
,
output_channels
,
kernel_size
,
stride
);
stride
);
test
.
Run
(
);
test
.
CheckForward
(
1e-3
,
true
);
test
.
Check
Out
(
static_cast
<
paddle
::
platform
::
float16
>
(
1e-3
)
,
true
);
test
.
Check
Backward
(
1e-3
,
true
);
}
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录