Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
c240355c
O
Opencv
项目概览
Greenplum
/
Opencv
10 个月 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
c240355c
编写于
12月 15, 2020
作者:
A
Alexander Alekhin
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
dnn(ocl): avoid mess FP16/FP32 in convolution layer
上级
1bfc75ac
变更
6
隐藏空白更改
内联
并排
Showing
6 changed file
with
77 addition
and
61 deletion
+77
-61
modules/core/src/convert.dispatch.cpp
modules/core/src/convert.dispatch.cpp
+1
-1
modules/core/src/opencl/halfconvert.cl
modules/core/src/opencl/halfconvert.cl
+11
-2
modules/dnn/src/layers/convolution_layer.cpp
modules/dnn/src/layers/convolution_layer.cpp
+13
-13
modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
+0
-2
modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
+46
-42
modules/dnn/src/opencl/conv_spatial_helper.cl
modules/dnn/src/opencl/conv_spatial_helper.cl
+6
-1
未找到文件。
modules/core/src/convert.dispatch.cpp
浏览文件 @
c240355c
...
...
@@ -138,7 +138,7 @@ static bool ocl_convertFp16( InputArray _src, OutputArray _dst, int sdepth, int
sdepth
==
CV_32F
?
"half"
:
"float"
,
rowsPerWI
,
sdepth
==
CV_32F
?
" -D FLOAT_TO_HALF "
:
""
);
ocl
::
Kernel
k
(
"convertFp16
"
,
ocl
::
core
::
halfconvert_oclsrc
,
build_opt
);
ocl
::
Kernel
k
(
sdepth
==
CV_32F
?
"convertFp16_FP32_to_FP16"
:
"convertFp16_FP16_to_FP32
"
,
ocl
::
core
::
halfconvert_oclsrc
,
build_opt
);
if
(
k
.
empty
())
return
false
;
...
...
modules/core/src/opencl/halfconvert.cl
浏览文件 @
c240355c
...
...
@@ -47,8 +47,17 @@
#
endif
#
endif
__kernel
void
convertFp16
(
__global
const
uchar
*
srcptr,
int
src_step,
int
src_offset,
__global
uchar
*
dstptr,
int
dst_step,
int
dst_offset,
int
dst_rows,
int
dst_cols
)
__kernel
void
#
ifdef
FLOAT_TO_HALF
convertFp16_FP32_to_FP16
#
else
convertFp16_FP16_to_FP32
#
endif
(
__global
const
uchar
*
srcptr,
int
src_step,
int
src_offset,
__global
uchar
*
dstptr,
int
dst_step,
int
dst_offset,
int
dst_rows,
int
dst_cols
)
{
int
x
=
get_global_id
(
0
)
;
int
y0
=
get_global_id
(
1
)
*
rowsPerWI
;
...
...
modules/dnn/src/layers/convolution_layer.cpp
浏览文件 @
c240355c
...
...
@@ -1461,16 +1461,7 @@ public:
umat_blobs
.
resize
(
n
);
for
(
size_t
i
=
0
;
i
<
n
;
i
++
)
{
if
(
use_half
)
{
Mat
matFP32
;
convertFp16
(
inputs
[
i
+
1
],
matFP32
);
matFP32
.
copyTo
(
umat_blobs
[
i
]);
}
else
{
inputs
[
i
+
1
].
copyTo
(
umat_blobs
[
i
]);
}
inputs
[
i
+
1
].
copyTo
(
umat_blobs
[
i
]);
}
inputs
.
resize
(
1
);
}
...
...
@@ -1481,7 +1472,10 @@ public:
umat_blobs
.
resize
(
n
);
for
(
size_t
i
=
0
;
i
<
n
;
i
++
)
{
blobs
[
i
].
copyTo
(
umat_blobs
[
i
]);
if
(
use_half
)
convertFp16
(
blobs
[
i
],
umat_blobs
[
i
]);
else
blobs
[
i
].
copyTo
(
umat_blobs
[
i
]);
}
}
...
...
@@ -1537,14 +1531,20 @@ public:
if
(
fusedWeights
)
{
weightsMat
.
copyTo
(
umat_blobs
[
0
]);
if
(
use_half
)
convertFp16
(
weightsMat
,
umat_blobs
[
0
]);
else
weightsMat
.
copyTo
(
umat_blobs
[
0
]);
fusedWeights
=
false
;
}
if
(
fusedBias
)
{
if
(
umat_blobs
.
size
()
<
2
)
umat_blobs
.
resize
(
2
);
umat_blobs
[
1
]
=
UMat
(
biasvec
,
true
);
if
(
use_half
)
convertFp16
(
Mat
(
biasvec
,
true
),
umat_blobs
[
1
]);
else
Mat
(
biasvec
,
true
).
copyTo
(
umat_blobs
[
1
]);
convolutionOp
->
setBias
(
true
);
fusedBias
=
false
;
}
...
...
modules/dnn/src/ocl4dnn/include/ocl4dnn.hpp
浏览文件 @
c240355c
...
...
@@ -274,8 +274,6 @@ class OCL4DNNConvSpatial
int32_t
group_
;
bool
bias_term_
;
UMat
swizzled_weights_umat
;
UMat
weights_half
;
UMat
bias_half
;
UMat
bottom_data2_
;
int32_t
bottom_index_
;
...
...
modules/dnn/src/ocl4dnn/src/ocl4dnn_conv_spatial.cpp
浏览文件 @
c240355c
...
...
@@ -588,16 +588,16 @@ bool OCL4DNNConvSpatial<Dtype>::Forward(const UMat& bottom,
fused_eltwise_
=
false
;
}
if
(
use_half_
&&
bias_half
.
empty
()
&&
!
bias
.
empty
())
convertFp16
(
bias
,
bias_half
);
if
(
use_half_
&&
!
bias
.
empty
())
CV_CheckTypeEQ
(
bias
.
type
(),
CV_16SC1
,
""
);
if
(
use_half_
&&
weights_half
.
empty
()
)
convertFp16
(
weight
,
weights_half
);
if
(
use_half_
)
CV_CheckTypeEQ
(
weight
.
type
(),
CV_16SC1
,
""
);
prepareKernel
(
bottom
,
top
,
weight
,
(
use_half_
)
?
bias_half
:
bias
,
numImages
);
prepareKernel
(
bottom
,
top
,
weight
,
bias
,
numImages
);
if
(
bestKernelConfig
.
empty
())
return
false
;
return
convolve
(
bottom
,
top
,
weight
,
(
use_half_
)
?
bias_half
:
bias
,
numImages
,
bestKernelConfig
);
return
convolve
(
bottom
,
top
,
weight
,
bias
,
numImages
,
bestKernelConfig
);
}
template
<
typename
Dtype
>
...
...
@@ -744,29 +744,26 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
kernel_h_
*
(
int
)
alignSize
(
kernel_w_
,
2
),
(
use_half_
)
?
CV_16SC1
:
CV_32FC1
);
UMat
swizzled_weights_tmp
;
if
(
use_half_
)
swizzled_weights_tmp
.
create
(
shape
(
swizzled_weights_umat
),
CV_32F
);
if
(
!
interleave
)
{
cl_uint
argIdx
=
0
;
int32_t
channels
=
channels_
/
group_
;
ocl
::
Kernel
oclk_copy_weight
(
CL_KERNEL_SELECT
(
"copyWeightsSwizzled"
),
cv
::
ocl
::
dnn
::
conv_spatial_helper_oclsrc
);
ocl
::
Kernel
oclk_copy_weight
(
use_half_
?
"copyWeightsSwizzled_half"
:
"copyWeightsSwizzled_float"
,
cv
::
ocl
::
dnn
::
conv_spatial_helper_oclsrc
,
use_half_
?
"-DHALF_SUPPORT=1 -DDtype=half"
:
"-DDtype=float"
);
if
(
oclk_copy_weight
.
empty
())
return
false
;
oclk_copy_weight
.
set
(
argIdx
++
,
ocl
::
KernelArg
::
PtrReadOnly
(
weight
));
if
(
use_half_
)
oclk_copy_weight
.
set
(
argIdx
++
,
ocl
::
KernelArg
::
PtrWriteOnly
(
swizzled_weights_tmp
));
else
oclk_copy_weight
.
set
(
argIdx
++
,
ocl
::
KernelArg
::
PtrWriteOnly
(
swizzled_weights_umat
));
oclk_copy_weight
.
set
(
argIdx
++
,
kernel_w_
);
oclk_copy_weight
.
set
(
argIdx
++
,
kernel_h_
);
oclk_copy_weight
.
set
(
argIdx
++
,
channels
);
oclk_copy_weight
.
set
(
argIdx
++
,
num_output_
);
oclk_copy_weight
.
set
(
argIdx
++
,
swizzled_factor
);
oclk_copy_weight
.
args
(
ocl
::
KernelArg
::
PtrReadOnly
(
weight
),
ocl
::
KernelArg
::
PtrWriteOnly
(
swizzled_weights_umat
),
kernel_w_
,
kernel_h_
,
channels
,
num_output_
,
swizzled_factor
);
size_t
global_work_size_copy
[
3
]
=
{
(
size_t
)
(
alignSize
(
num_output_
,
swizzled_factor
)
*
channels
*
kernel_w_
*
kernel_h_
),
1
,
1
};
...
...
@@ -778,13 +775,24 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
}
}
else
{
// assumption: kernel dimension is 2
Mat
weightMat
=
weight
.
getMat
(
ACCESS_READ
);
Dtype
*
cpu_weight
=
(
Dtype
*
)
weightMat
.
ptr
<
float
>
();
Mat
weightMat
;
Mat
swizzledWeightMat
;
UMat
weight_tmp
;
// FP32 in half mode, TODO implement FP16 repack
if
(
use_half_
)
swizzledWeightMat
=
swizzled_weights_tmp
.
getMat
(
ACCESS_WRITE
);
{
CV_CheckTypeEQ
(
weight
.
type
(),
CV_16SC1
,
""
);
convertFp16
(
weight
,
weight_tmp
);
weightMat
=
weight_tmp
.
getMat
(
ACCESS_READ
);
swizzledWeightMat
.
create
(
shape
(
swizzled_weights_umat
),
CV_32F
);
}
else
{
weightMat
=
weight
.
getMat
(
ACCESS_READ
);
swizzledWeightMat
=
swizzled_weights_umat
.
getMat
(
ACCESS_WRITE
);
}
CV_CheckTypeEQ
(
weightMat
.
type
(),
CV_32FC1
,
""
);
Dtype
*
cpu_weight
=
(
Dtype
*
)
weightMat
.
ptr
<
float
>
();
Dtype
*
cpu_swizzled_weight
=
(
Dtype
*
)
swizzledWeightMat
.
ptr
<
float
>
();
int
interleavedRows
=
(
kernel_w_
/
2
)
*
2
;
...
...
@@ -792,26 +800,28 @@ bool OCL4DNNConvSpatial<Dtype>::swizzleWeight(const UMat &weight,
int
blockWidth
=
swizzled_factor
;
// should equal to simd size.
int
rowAlignment
=
32
;
size_t
interleaved_filter_size
=
M_
*
kernel_w_
*
kernel_h_
*
channels_
*
sizeof
(
Dtype
);
Dtype
*
tmpSwizzledWeight
=
reinterpret_cast
<
Dtype
*>
(
malloc
(
interleaved_filter_size
));
CHECK_EQ
(
tmpSwizzledWeight
!=
NULL
,
true
)
<<
"Failed to allocate temporary swizzled weight"
;
cv
::
AutoBuffer
<
Dtype
,
0
>
tmpSwizzledWeight
(
interleaved_filter_size
);
for
(
int
od
=
0
;
od
<
M_
;
od
++
)
for
(
int
id
=
0
;
id
<
channels_
;
id
++
)
for
(
int
r
=
0
;
r
<
kernel_h_
;
r
++
)
for
(
int
c
=
0
;
c
<
kernel_w_
;
c
++
)
tmpSwizzledWeight
[((
id
*
kernel_h_
+
r
)
*
kernel_w_
+
c
)
*
M_
+
od
]
=
cpu_weight
[((
od
*
channels_
+
id
)
*
kernel_h_
+
r
)
*
kernel_w_
+
c
];
interleaveMatrix
(
cpu_swizzled_weight
,
tmpSwizzledWeight
,
tmpSwizzledWeight
.
data
()
,
kernel_w_
*
kernel_h_
*
channels_
,
M_
,
interleavedRows
,
nonInterleavedRows
,
blockWidth
,
rowAlignment
);
free
(
tmpSwizzledWeight
);
}
if
(
use_half_
)
convertFp16
(
swizzled_weights_tmp
,
swizzled_weights_umat
);
// unmap OpenCL buffers
weightMat
.
release
();
if
(
use_half_
)
convertFp16
(
swizzledWeightMat
,
swizzled_weights_umat
);
}
return
true
;
}
...
...
@@ -1104,10 +1114,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
cl_uint
argIdx
=
0
;
setFusionArg
(
fused_activ_
,
fused_eltwise_
,
kernel
,
argIdx
);
kernel
.
set
(
argIdx
++
,
ocl
::
KernelArg
::
PtrReadOnly
(
bottom
));
if
(
use_half_
)
kernel
.
set
(
argIdx
++
,
ocl
::
KernelArg
::
PtrReadOnly
(
weights_half
));
else
kernel
.
set
(
argIdx
++
,
ocl
::
KernelArg
::
PtrReadOnly
(
weight
));
kernel
.
set
(
argIdx
++
,
ocl
::
KernelArg
::
PtrReadOnly
(
weight
));
if
(
bias_term_
)
kernel
.
set
(
argIdx
++
,
ocl
::
KernelArg
::
PtrReadOnly
(
bias
));
kernel
.
set
(
argIdx
++
,
ocl
::
KernelArg
::
PtrWriteOnly
(
top
));
...
...
@@ -1148,10 +1155,7 @@ bool OCL4DNNConvSpatial<float>::convolve(const UMat &bottom, UMat &top,
setFusionArg
(
fused_activ_
,
fused_eltwise_
,
kernel
,
argIdx
);
kernel
.
set
(
argIdx
++
,
ocl
::
KernelArg
::
PtrReadOnly
(
bottom
));
kernel
.
set
(
argIdx
++
,
image_offset
);
if
(
use_half_
)
kernel
.
set
(
argIdx
++
,
ocl
::
KernelArg
::
PtrReadOnly
(
weights_half
));
else
kernel
.
set
(
argIdx
++
,
ocl
::
KernelArg
::
PtrReadOnly
(
weight
));
kernel
.
set
(
argIdx
++
,
ocl
::
KernelArg
::
PtrReadOnly
(
weight
));
kernel
.
set
(
argIdx
++
,
kernel_offset
);
if
(
bias_term_
)
kernel
.
set
(
argIdx
++
,
ocl
::
KernelArg
::
PtrReadOnly
(
bias
));
...
...
@@ -1956,7 +1960,7 @@ void OCL4DNNConvSpatial<Dtype>::prepareKernel(const UMat &bottom, UMat &top,
UMat
benchData
(
1
,
numImages
*
top_dim_
,
(
use_half_
)
?
CV_16SC1
:
CV_32FC1
);
calculateBenchmark
(
bottom
,
benchData
,
(
use_half_
)
?
weights_half
:
weight
,
bias
,
numImages
);
calculateBenchmark
(
bottom
,
benchData
,
weight
,
bias
,
numImages
);
if
(
run_auto_tuning_
||
force_auto_tuning_
)
{
...
...
modules/dnn/src/opencl/conv_spatial_helper.cl
浏览文件 @
c240355c
...
...
@@ -39,9 +39,14 @@
//
//M*/
#
ifdef
HALF_SUPPORT
#
ifdef
cl_khr_fp16
#
pragma
OPENCL
EXTENSION
cl_khr_fp16:enable
#
endif
#
endif
#
define
CONCAT
(
A,B
)
A##_##B
#
define
TEMPLATE
(
name,type
)
CONCAT
(
name,type
)
#
define
Dtype
float
__kernel
void
TEMPLATE
(
copyWeightsSwizzled,
Dtype
)
(
__global
Dtype*
weightIn,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录