Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
3c32ad6d
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
404
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
3c32ad6d
编写于
5月 26, 2020
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(dnn/x86): imp avx2 int8 stride2 chanwise conv
GitOrigin-RevId: 288792de4244ae77e73291d8ed7d7fffbf869d04
上级
89374521
变更
13
展开全部
隐藏空白更改
内联
并排
Showing
13 changed file
with
1271 addition
and
170 deletion
+1271
-170
dnn/src/x86/conv_bias/int8/algos.cpp
dnn/src/x86/conv_bias/int8/algos.cpp
+68
-5
dnn/src/x86/conv_bias/int8/algos.h
dnn/src/x86/conv_bias/int8/algos.h
+25
-2
dnn/src/x86/conv_bias/int8/avx2_chanwise_kern.cpp
dnn/src/x86/conv_bias/int8/avx2_chanwise_kern.cpp
+700
-12
dnn/src/x86/conv_bias/int8/avx2_chanwise_kern.h
dnn/src/x86/conv_bias/int8/avx2_chanwise_kern.h
+19
-0
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.cpp
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.cpp
+3
-55
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.h
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.h
+1
-13
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride2.cpp
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride2.cpp
+204
-0
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride2.h
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride2.h
+30
-0
dnn/src/x86/conv_bias/int8/chanwise_helper.h
dnn/src/x86/conv_bias/int8/chanwise_helper.h
+83
-0
dnn/src/x86/conv_bias/int8/common_helper.h
dnn/src/x86/conv_bias/int8/common_helper.h
+4
-2
dnn/src/x86/conv_bias/opr_impl.cpp
dnn/src/x86/conv_bias/opr_impl.cpp
+12
-3
dnn/src/x86/conv_bias/opr_impl.h
dnn/src/x86/conv_bias/opr_impl.h
+3
-1
dnn/test/x86/conv_bias.cpp
dnn/test/x86/conv_bias.cpp
+119
-77
未找到文件。
dnn/src/x86/conv_bias/int8/algos.cpp
浏览文件 @
3c32ad6d
...
...
@@ -6,16 +6,18 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/x86/conv_bias/int8/algos.h"
#include "src/common/opr_delegate.h"
#include "src/common/utils.h"
#include "src/fallback/convolution/img2col_helper.h"
#include "src/x86/conv_bias/int8/avx2_chanwise_stride1.h"
#include "src/x86/conv_bias/int8/avx2_chanwise_stride2.h"
#include "src/x86/conv_bias/int8/avx2_direct_conv_stride1.h"
#include "src/x86/conv_bias/int8/avx2_direct_conv_stride2.h"
#include "src/x86/conv_bias/int8/avx2_chanwise_stride1.h"
#include "src/x86/conv_bias/opr_impl.h"
#include "src/x86/conv_bias/postprocess_helper.h"
#include "src/x86/handle.h"
...
...
@@ -38,6 +40,7 @@ bool ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::usable(
auto
&&
fm
=
param
.
filter_meta
;
auto
FH
=
fm
.
spatial
[
0
];
bool
aviliable
=
(
param
.
bias_mode
!=
BiasMode
::
BIAS
)
&&
((
param
.
src_type
.
enumv
()
==
DTypeEnum
::
QuantizedS8
&&
param
.
filter_type
.
enumv
()
==
DTypeEnum
::
QuantizedS8
&&
param
.
dst_type
.
enumv
()
==
DTypeEnum
::
QuantizedS8
)
||
...
...
@@ -61,12 +64,12 @@ WorkspaceBundle ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::get_bundle(
size_t
IH2
,
IW2
,
OH2
,
OW2
;
size_t
src_size
=
0
,
dst_size
=
0
,
int32_temp
=
0
;
avx2_chanwise_stride1
::
get_rectified_size
(
param
,
IH2
,
IW2
,
OH2
,
OW2
);
get_rectified_size
(
param
,
IH2
,
IW2
,
OH2
,
OW2
);
if
(
avx2_chanwise_stride1
::
need_src_copy
(
param
))
{
if
(
need_src_copy
(
param
))
{
src_size
=
IH2
*
IW2
*
sizeof
(
int8_t
)
*
nr_threads
;
}
if
(
avx2_chanwise_stride1
::
need_dst_copy
(
param
))
{
if
(
need_dst_copy
(
param
))
{
dst_size
=
OH2
*
OW2
*
param
.
dst_type
.
size
()
*
nr_threads
;
}
bool
dst_need_convert
=
param
.
dst_type
.
enumv
()
==
DTypeEnum
::
QuantizedS8
;
...
...
@@ -91,6 +94,66 @@ ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::get_kimpls(
return
avx2_chanwise_stride1
::
get_kimpls
(
param
,
bundle
);
}
bool
ConvBiasImpl
::
AlgoChanWiseAvx2Stride2Qint8
::
usable
(
FallbackConvBiasImpl
*
/*opr*/
,
const
NCBKernSizeParam
&
param
,
AlgoSelectionStrategy
/*algo_selection_strategy*/
)
const
{
auto
&&
fm
=
param
.
filter_meta
;
auto
FH
=
fm
.
spatial
[
0
];
bool
aviliable
=
(
param
.
bias_mode
!=
BiasMode
::
BIAS
)
&&
((
param
.
src_type
.
enumv
()
==
DTypeEnum
::
QuantizedS8
&&
param
.
filter_type
.
enumv
()
==
DTypeEnum
::
QuantizedS8
&&
param
.
dst_type
.
enumv
()
==
DTypeEnum
::
QuantizedS8
)
||
(((
param
.
src_type
.
enumv
()
==
DTypeEnum
::
Int8
&&
param
.
filter_type
.
enumv
()
==
DTypeEnum
::
Int8
&&
param
.
dst_type
.
enumv
()
==
DTypeEnum
::
Int32
)
||
(
param
.
src_type
.
enumv
()
==
DTypeEnum
::
QuantizedS8
&&
param
.
filter_type
.
enumv
()
==
DTypeEnum
::
QuantizedS8
&&
param
.
dst_type
.
enumv
()
==
DTypeEnum
::
QuantizedS32
))))
&&
fm
.
format
==
Param
::
Format
::
NCHW
&&
fm
.
spatial_ndim
==
2
&&
fm
.
dilation
[
0
]
==
1
&&
fm
.
dilation
[
1
]
==
1
&&
(
FH
==
2
||
FH
==
3
||
FH
==
5
||
FH
==
7
)
&&
fm
.
stride
[
0
]
==
2
&&
fm
.
stride
[
1
]
==
2
&&
(
fm
.
icpg
==
1
)
&&
(
fm
.
ocpg
==
1
)
&&
is_supported
(
SIMDType
::
AVX2
);
return
aviliable
;
}
WorkspaceBundle
ConvBiasImpl
::
AlgoChanWiseAvx2Stride2Qint8
::
get_bundle
(
const
NCBKernSizeParam
&
param
)
{
size_t
nr_threads
=
param
.
nr_threads
;
size_t
IH2
,
IW2
,
OH2
,
OW2
;
size_t
src_size
=
0
,
dst_size
=
0
,
int32_temp
=
0
;
get_rectified_size
(
param
,
IH2
,
IW2
,
OH2
,
OW2
);
if
(
need_src_copy
(
param
))
{
src_size
=
IH2
*
IW2
*
sizeof
(
int8_t
)
*
nr_threads
;
}
if
(
need_dst_copy
(
param
))
{
dst_size
=
OH2
*
OW2
*
param
.
dst_type
.
size
()
*
nr_threads
;
}
bool
dst_need_convert
=
param
.
dst_type
.
enumv
()
==
DTypeEnum
::
QuantizedS8
;
if
(
dst_need_convert
)
{
int32_temp
=
OH2
*
OW2
*
sizeof
(
int32_t
)
*
nr_threads
;
}
return
dst_need_convert
?
WorkspaceBundle
(
nullptr
,
{
src_size
,
dst_size
,
int32_temp
})
:
WorkspaceBundle
(
nullptr
,
{
src_size
,
dst_size
});
}
size_t
ConvBiasImpl
::
AlgoChanWiseAvx2Stride2Qint8
::
get_workspace
(
FallbackConvBiasImpl
*
,
const
NCBKernSizeParam
&
param
)
const
{
return
get_bundle
(
param
).
total_size_in_bytes
();
}
SmallVector
<
fallback
::
ConvBiasImpl
::
NCBKern
>
ConvBiasImpl
::
AlgoChanWiseAvx2Stride2Qint8
::
get_kimpls
(
const
NCBKernSizeParam
&
param
)
const
{
auto
bundle
=
get_bundle
(
param
);
return
avx2_chanwise_stride2
::
get_kimpls
(
param
,
bundle
);
}
bool
ConvBiasImpl
::
AlgoDirectAvx2Stride1Int8
::
usable
(
FallbackConvBiasImpl
*
/*opr*/
,
const
NCBKernSizeParam
&
param
,
AlgoSelectionStrategy
/*algo_selection_strategy*/
)
const
{
...
...
dnn/src/x86/conv_bias/int8/algos.h
浏览文件 @
3c32ad6d
...
...
@@ -6,7 +6,8 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "src/x86/conv_bias/opr_impl.h"
...
...
@@ -36,6 +37,28 @@ public:
void
*
type
()
const
override
;
};
/* ===================== avx2 stride2 chanwise algo ===================== */
class
ConvBiasImpl
::
AlgoChanWiseAvx2Stride2Qint8
final
:
public
AlgoBase
{
SmallVector
<
NCBKern
>
get_kimpls
(
const
NCBKernSizeParam
&
param
)
const
;
static
WorkspaceBundle
get_bundle
(
const
NCBKernSizeParam
&
param
);
public:
bool
is_reproducible
()
const
override
{
return
true
;
}
const
char
*
name
()
const
override
{
return
"X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE2"
;
}
bool
usable
(
FallbackConvBiasImpl
*
opr
,
const
NCBKernSizeParam
&
param
,
AlgoSelectionStrategy
algo_selection_strategy
)
const
override
;
size_t
get_workspace
(
FallbackConvBiasImpl
*
opr
,
const
NCBKernSizeParam
&
param
)
const
override
;
virtual
SmallVector
<
NCBKern
>
dispatch_kerns
(
fallback
::
ConvBiasImpl
*
,
const
NCBKernSizeParam
&
param
)
const
override
{
return
get_kimpls
(
param
);
}
void
*
type
()
const
override
;
};
/* ===================== avx2 stride1 direct algo ===================== */
class
ConvBiasImpl
::
AlgoDirectAvx2Stride1Int8
final
:
public
AlgoBase
{
SmallVector
<
NCBKern
>
get_kimpls
(
const
NCBKernSizeParam
&
param
)
const
;
...
...
@@ -125,7 +148,7 @@ public:
void
*
type
()
const
override
;
};
#endif
/* ==================
=== avx2 int8 direct conv stride2 algo ===
================== */
/* ==================
avx2 int8 direct conv stride2 algo
================== */
class
ConvBiasImpl
::
AlgoAVX2DirectConvStride2
final
:
public
AlgoBase
{
SmallVector
<
NCBKern
>
get_kimpls
(
const
NCBKernSizeParam
&
param
)
const
;
static
WorkspaceBundle
get_bundle
(
const
NCBKernSizeParam
&
param
);
...
...
dnn/src/x86/conv_bias/int8/avx2_chanwise_kern.cpp
浏览文件 @
3c32ad6d
此差异已折叠。
点击以展开。
dnn/src/x86/conv_bias/int8/avx2_chanwise_kern.h
浏览文件 @
3c32ad6d
...
...
@@ -33,6 +33,25 @@ KERN(stride1, 7)
#undef KERN
}
// namespace avx2_chanwise_stride1
namespace
avx2_chanwise_stride2
{
#define KERN(stride, i) \
template <BiasMode bias_mode, bool is_quantized, typename Op> \
MEGDNN_ATTRIBUTE_TARGET("avx2") \
void avx2_chanwise_direct_##stride##_##i##x##i##_int8( \
const int8_t* src, const int8_t* filter, const int32_t* bias, \
int32_t* temp, int8_t* dst, const size_t IH, const size_t IW, \
const size_t OH, const size_t OW, const Op& op);
KERN
(
stride2
,
2
)
KERN
(
stride2
,
3
)
KERN
(
stride2
,
5
)
KERN
(
stride2
,
7
)
#undef KERN
}
// namespace avx2_chanwise_stride2
}
// namespace x86
}
// namespace megdnn
...
...
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.cpp
浏览文件 @
3c32ad6d
...
...
@@ -18,57 +18,6 @@ namespace megdnn {
namespace
x86
{
namespace
avx2_chanwise_stride1
{
bool
need_dst_copy
(
const
NCBKernSizeParam
&
param
)
{
return
param
.
osz
[
1
]
%
16
;
}
bool
need_src_copy
(
const
NCBKernSizeParam
&
param
)
{
auto
&&
fm
=
param
.
filter_meta
;
return
(
fm
.
padding
[
0
]
!=
0
||
fm
.
padding
[
1
]
!=
0
)
?
true
:
need_dst_copy
(
param
);
}
void
get_rectified_size
(
const
NCBKernSizeParam
&
param
,
size_t
&
IH2
,
size_t
&
IW2
,
size_t
&
OH2
,
size_t
&
OW2
)
{
auto
&&
fm
=
param
.
filter_meta
;
auto
SW
=
fm
.
stride
[
1
];
auto
OH
=
param
.
osz
[
0
];
auto
OW
=
param
.
osz
[
1
];
auto
FH
=
fm
.
spatial
[
0
];
auto
FW
=
fm
.
spatial
[
1
];
OH2
=
OH
;
OW2
=
(
OW
+
15
)
&
~
15
;
IH2
=
SW
*
OH
+
FH
-
SW
;
IW2
=
SW
*
OW2
+
FW
-
SW
;
}
void
copy_padding_kern
(
WorkspaceBundle
bundle
,
const
ConvBiasImpl
::
NCBKernParam
&
kern_param
,
const
ConvBiasImpl
::
NCBKernIndex
&
ncb_index
)
{
size_t
IH
=
kern_param
.
isz
[
0
];
size_t
IW
=
kern_param
.
isz
[
1
];
size_t
PH
=
kern_param
.
filter_meta
.
padding
[
0
];
size_t
PW
=
kern_param
.
filter_meta
.
padding
[
1
];
size_t
IH2
,
IW2
,
OH2
,
OW2
;
get_rectified_size
(
kern_param
,
IH2
,
IW2
,
OH2
,
OW2
);
bool
need_src_copy_var
=
need_src_copy
(
kern_param
);
size_t
padding_group_size
=
IH2
*
IW2
;
bundle
.
set
(
kern_param
.
workspace_ptr
);
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
],
batch_id
=
ncb_index
.
ndrange_id
[
1
],
channel_id
=
ncb_index
.
ndrange_id
[
2
];
size_t
workspace_group_id
=
ncb_index
.
thread_id
;
const
int8_t
*
sptr
=
kern_param
.
src
<
int8_t
>
(
batch_id
,
group_id
,
channel_id
);
if
(
need_src_copy_var
)
{
int8_t
*
sptr_base
=
static_cast
<
int8_t
*>
(
bundle
.
get
(
0
))
+
workspace_group_id
*
padding_group_size
;
std
::
memset
(
sptr_base
,
0
,
sizeof
(
int8_t
)
*
IH2
*
IW2
);
rep
(
ih
,
IH
)
{
std
::
memcpy
(
sptr_base
+
(
ih
+
PH
)
*
IW2
+
PW
,
sptr
+
ih
*
IW
,
sizeof
(
int8_t
)
*
IW
);
}
}
};
template
<
size_t
filter
,
BiasMode
bias_mode
,
bool
is_quantized
,
typename
Op
>
void
conv_kimpl
(
WorkspaceBundle
bundle
,
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
{
...
...
@@ -97,8 +46,7 @@ void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param,
batch_id
=
ncb_index
.
ndrange_id
[
1
];
const
int8_t
*
sptr
=
kern_param
.
src
<
dt_int8
>
(
batch_id
,
group_id
);
const
int8_t
*
fptr
=
kern_param
.
filter
<
dt_int8
>
(
group_id
);
const
int8_t
*
fptr
=
kern_param
.
filter
<
dt_int8
>
(
group_id
);
void
*
dst
=
kern_param
.
dst
<
void
>
(
batch_id
,
group_id
);
const
int32_t
*
bptr
=
kern_param
.
bias
<
dt_int32
>
(
batch_id
,
group_id
);
if
(
need_src_copy_var
)
{
...
...
@@ -130,9 +78,9 @@ void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param,
if
(
need_post_process
)
{
tptr
=
static_cast
<
int32_t
*>
(
bundle
.
get
(
2
))
+
ncb_index
.
thread_id
*
OH2
*
OW2
*
kern_param
.
dst_type
.
size
();
DISPATCH_FILTER
(
filter
,
KERN_NEED_POST_PROCESS
)
DISPATCH_FILTER
(
filter
,
KERN_NEED_POST_PROCESS
)
}
else
{
DISPATCH_FILTER
(
filter
,
KERN_NO_POST_PROCESS
)
DISPATCH_FILTER
(
filter
,
KERN_NO_POST_PROCESS
)
}
#undef KERN_NEED_POST_PROCESS
...
...
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.h
浏览文件 @
3c32ad6d
...
...
@@ -11,27 +11,15 @@
*/
#pragma once
#include "src/x86/conv_bias/int8/common_helper.h"
#include "src/x86/conv_bias/opr_impl.h"
namespace
megdnn
{
namespace
x86
{
namespace
avx2_chanwise_stride1
{
using
NCBKern
=
fallback
::
ConvBiasImpl
::
NCBKern
;
using
NCBKernSizeParam
=
fallback
::
ConvBiasImpl
::
NCBKernSizeParam
;
using
NCBKernParam
=
fallback
::
ConvBiasImpl
::
NCBKernParam
;
using
NCBKernIndex
=
fallback
::
ConvBiasImpl
::
NCBKernIndex
;
using
conv_fun
=
std
::
function
<
void
(
WorkspaceBundle
bundle
,
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
>
;
bool
need_dst_copy
(
const
NCBKernSizeParam
&
param
);
bool
need_src_copy
(
const
NCBKernSizeParam
&
param
);
void
get_rectified_size
(
const
NCBKernSizeParam
&
param
,
size_t
&
IH2
,
size_t
&
IW2
,
size_t
&
OH2
,
size_t
&
OW2
);
SmallVector
<
NCBKern
>
get_kimpls
(
const
NCBKernSizeParam
&
param
,
WorkspaceBundle
bundle
);
...
...
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride2.cpp
0 → 100644
浏览文件 @
3c32ad6d
/**
* \file src/x86/conv_bias/int8/avx2_chanwsie_stride2.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/x86/conv_bias/int8/avx2_chanwise_stride2.h"
#include "src/x86/conv_bias/int8/avx2_chanwise_kern.h"
#include "src/x86/elemwise_op.h"
namespace
megdnn
{
namespace
x86
{
namespace
avx2_chanwise_stride2
{
template
<
size_t
filter
,
BiasMode
bias_mode
,
bool
is_quantized
,
typename
Op
>
void
conv_kimpl
(
WorkspaceBundle
bundle
,
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
{
size_t
OH
=
kern_param
.
osz
[
0
];
size_t
OW
=
kern_param
.
osz
[
1
];
size_t
IH2
,
IW2
,
OH2
,
OW2
;
get_rectified_size
(
kern_param
,
IH2
,
IW2
,
OH2
,
OW2
);
bool
need_src_copy_var
=
need_src_copy
(
kern_param
);
bool
need_dst_copy_var
=
need_dst_copy
(
kern_param
);
bool
need_post_process
=
kern_param
.
dst_type
.
enumv
()
==
DTypeEnum
::
QuantizedS8
;
Op
op
=
Op
(
1.0
f
,
4.0
f
);
if
(
need_post_process
)
{
float
scale_bias
=
kern_param
.
bias_type
.
param
<
dtype
::
QuantizedS32
>
().
scale
;
float
scale_dst
=
kern_param
.
dst_type
.
param
<
dtype
::
QuantizedS8
>
().
scale
;
op
=
Op
(
scale_bias
,
scale_dst
);
}
size_t
padding_group_size
=
IH2
*
IW2
;
bundle
.
set
(
kern_param
.
workspace_ptr
);
size_t
workspace_group_id
=
ncb_index
.
thread_id
;
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
],
batch_id
=
ncb_index
.
ndrange_id
[
1
];
const
int8_t
*
sptr
=
kern_param
.
src
<
dt_int8
>
(
batch_id
,
group_id
);
const
int8_t
*
fptr
=
kern_param
.
filter
<
dt_int8
>
(
group_id
);
void
*
dst
=
kern_param
.
dst
<
void
>
(
batch_id
,
group_id
);
const
int32_t
*
bptr
=
kern_param
.
bias
<
dt_int32
>
(
batch_id
,
group_id
);
if
(
need_src_copy_var
)
{
sptr
=
static_cast
<
int8_t
*>
(
bundle
.
get
(
0
))
+
workspace_group_id
*
padding_group_size
;
}
void
*
dptr
=
nullptr
;
int32_t
*
tptr
=
nullptr
;
if
(
need_dst_copy_var
)
{
dptr
=
reinterpret_cast
<
void
*>
(
reinterpret_cast
<
ptrdiff_t
>
(
bundle
.
get
(
1
))
+
ncb_index
.
thread_id
*
OH2
*
OW2
*
kern_param
.
dst_type
.
size
());
}
else
{
dptr
=
dst
;
}
#define KERN_NEED_POST_PROCESS(filter) \
avx2_chanwise_direct_stride2_##filter##x##filter##_int8<bias_mode, true, \
Op>( \
sptr, fptr, bptr, tptr, static_cast<int8_t*>(dptr), IH2, IW2, OH2, \
OW2, op)
#define KERN_NO_POST_PROCESS(filter) \
avx2_chanwise_direct_stride2_##filter##x##filter##_int8<bias_mode, false, \
Op>( \
sptr, fptr, bptr, static_cast<int32_t*>(dptr), nullptr, IH2, IW2, \
OH2, OW2, op)
if
(
need_post_process
)
{
tptr
=
static_cast
<
int32_t
*>
(
bundle
.
get
(
2
))
+
ncb_index
.
thread_id
*
OH2
*
OW2
*
kern_param
.
dst_type
.
size
();
DISPATCH_FILTER
(
filter
,
KERN_NEED_POST_PROCESS
)
}
else
{
DISPATCH_FILTER
(
filter
,
KERN_NO_POST_PROCESS
)
}
#undef KERN_NEED_POST_PROCESS
#undef KERN_NO_POST_PROCESS
if
(
need_dst_copy_var
)
{
rep
(
oh
,
OH
)
{
std
::
memcpy
(
reinterpret_cast
<
void
*>
(
reinterpret_cast
<
ptrdiff_t
>
(
dst
)
+
oh
*
OW
*
kern_param
.
dst_type
.
size
()),
reinterpret_cast
<
void
*>
(
reinterpret_cast
<
ptrdiff_t
>
(
dptr
)
+
oh
*
OW2
*
kern_param
.
dst_type
.
size
()),
kern_param
.
dst_type
.
size
()
*
OW
);
}
}
};
SmallVector
<
NCBKern
>
get_kimpls
(
const
NCBKernSizeParam
&
kern_param
,
WorkspaceBundle
bundle
)
{
MEGDNN_MARK_USED_VAR
(
kern_param
);
auto
fm
=
kern_param
.
filter_meta
;
size_t
group
=
fm
.
group
;
size_t
n
=
kern_param
.
n
;
SmallVector
<
NCBKern
>
ncb_kerns
;
conv_fun
do_conv_fun
=
nullptr
;
#define DO_CONV_KERN_FUN(filter, bias_mode, is_quantized, op) \
do_conv_fun = conv_kimpl<filter, bias_mode, is_quantized, op>;
#define GET_OP_PARAM(i, bias_mode, is_quantized) \
switch (kern_param.nonlineMode) { \
case param::ConvBias::NonlineMode::IDENTITY: \
DO_CONV_KERN_FUN(i, bias_mode, is_quantized, \
TypeCvtOp<SIMDType::AVX2 MEGDNN_COMMA dt_qint32 \
MEGDNN_COMMA dt_qint8>) \
break; \
case param::ConvBias::NonlineMode::RELU: \
DO_CONV_KERN_FUN(i, bias_mode, is_quantized, \
ReluOp<SIMDType::AVX2 MEGDNN_COMMA dt_qint32 \
MEGDNN_COMMA dt_qint8>) \
break; \
case param::ConvBias::NonlineMode::H_SWISH: \
DO_CONV_KERN_FUN(i, bias_mode, is_quantized, \
HSwishOp<SIMDType::AVX2 MEGDNN_COMMA dt_qint32 \
MEGDNN_COMMA dt_qint8>) \
break; \
default: \
megdnn_assert(0, "do not support nonlineMode: %d", \
static_cast<int>(kern_param.nonlineMode)); \
break; \
}
#define GET_BIAS_MODE_PARAM(i, is_quantized) \
switch (kern_param.bias_mode) { \
case BiasMode::NO_BIAS: \
GET_OP_PARAM(i, BiasMode::NO_BIAS, is_quantized) \
break; \
case BiasMode::BROADCAST_CHANNEL_BIAS: \
GET_OP_PARAM(i, BiasMode::BROADCAST_CHANNEL_BIAS, is_quantized) \
break; \
default: \
megdnn_assert(0, "do not support bias mode: %d", \
static_cast<int>(kern_param.bias_mode)); \
break; \
}
#define GET_QUANTIZED(i) \
switch (kern_param.dst_type.enumv()) { \
case DTypeEnum::QuantizedS8: \
GET_BIAS_MODE_PARAM(i, true) \
break; \
case DTypeEnum::QuantizedS32: \
GET_BIAS_MODE_PARAM(i, false) \
break; \
case DTypeEnum::Int32: \
GET_BIAS_MODE_PARAM(i, false) \
break; \
default: \
megdnn_assert(0, "do not support dtype: %d", \
static_cast<int>(kern_param.dst_type.enumv())); \
break; \
}
#define DISPATCH_CONV_KERN() \
switch (kern_param.filter_meta.spatial[0]) { \
case 2: \
GET_QUANTIZED(2) \
break; \
case 3: \
GET_QUANTIZED(3) \
break; \
case 5: \
GET_QUANTIZED(5) \
break; \
case 7: \
GET_QUANTIZED(7) \
break; \
default: \
megdnn_assert( \
0, "do not support kernel: %d", \
static_cast<int>(kern_param.filter_meta.spatial[0])); \
break; \
}
DISPATCH_CONV_KERN
();
auto
exec_one_group
=
[
bundle
,
do_conv_fun
](
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
{
copy_padding_kern
(
bundle
,
kern_param
,
ncb_index
);
do_conv_fun
(
bundle
,
kern_param
,
ncb_index
);
};
ncb_kerns
.
push_back
({
exec_one_group
,
{
group
,
n
,
1
_z
}});
return
ncb_kerns
;
}
}
// namespace avx2_chanwise_stride2
}
// namespace x86
}
// namespace megdnn
// vim: syntax=cpp.doxygen
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride2.h
0 → 100644
浏览文件 @
3c32ad6d
/**
* \file src/x86/conv_bias/int8/avx2_chanwsie_stride2.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "src/x86/conv_bias/int8/common_helper.h"
#include "src/x86/conv_bias/opr_impl.h"
namespace
megdnn
{
namespace
x86
{
namespace
avx2_chanwise_stride2
{
using
conv_fun
=
std
::
function
<
void
(
WorkspaceBundle
bundle
,
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
>
;
SmallVector
<
NCBKern
>
get_kimpls
(
const
NCBKernSizeParam
&
param
,
WorkspaceBundle
bundle
);
}
// namespace avx2_chanwise_stride2
}
// namespace x86
}
// namespace megdnn
// vim: syntax=cpp.doxygen
dnn/src/x86/conv_bias/int8/chanwise_helper.h
0 → 100644
浏览文件 @
3c32ad6d
/**
* \file dnn/src/x86/conv_bias/int8/chainwise_helper.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megdnn/arch.h"
#include "src/x86/conv_bias/opr_impl.h"
namespace
megdnn
{
namespace
x86
{
using
NCBKern
=
fallback
::
ConvBiasImpl
::
NCBKern
;
using
NCBKernSizeParam
=
fallback
::
ConvBiasImpl
::
NCBKernSizeParam
;
using
NCBKernParam
=
fallback
::
ConvBiasImpl
::
NCBKernParam
;
using
NCBKernIndex
=
fallback
::
ConvBiasImpl
::
NCBKernIndex
;
static
inline
bool
need_dst_copy
(
const
NCBKernSizeParam
&
param
)
{
return
param
.
osz
[
1
]
%
16
;
}
static
inline
bool
need_src_copy
(
const
NCBKernSizeParam
&
param
)
{
auto
&&
fm
=
param
.
filter_meta
;
return
(
fm
.
padding
[
0
]
!=
0
||
fm
.
padding
[
1
]
!=
0
)
?
true
:
need_dst_copy
(
param
);
}
static
inline
void
get_rectified_size
(
const
NCBKernSizeParam
&
param
,
size_t
&
IH2
,
size_t
&
IW2
,
size_t
&
OH2
,
size_t
&
OW2
)
{
auto
&&
fm
=
param
.
filter_meta
;
auto
SW
=
fm
.
stride
[
1
];
auto
OH
=
param
.
osz
[
0
];
auto
OW
=
param
.
osz
[
1
];
auto
FH
=
fm
.
spatial
[
0
];
auto
FW
=
fm
.
spatial
[
1
];
OH2
=
OH
;
OW2
=
(
OW
+
15
)
&
~
15
;
IH2
=
SW
*
OH
+
FH
-
SW
;
IW2
=
SW
*
OW2
+
FW
-
SW
;
}
static
inline
void
copy_padding_kern
(
WorkspaceBundle
bundle
,
const
ConvBiasImpl
::
NCBKernParam
&
kern_param
,
const
ConvBiasImpl
::
NCBKernIndex
&
ncb_index
)
{
size_t
IW
=
kern_param
.
isz
[
1
];
size_t
IH
=
kern_param
.
isz
[
0
];
size_t
PH
=
kern_param
.
filter_meta
.
padding
[
0
];
size_t
PW
=
kern_param
.
filter_meta
.
padding
[
1
];
size_t
IH2
,
IW2
,
OH2
,
OW2
;
get_rectified_size
(
kern_param
,
IH2
,
IW2
,
OH2
,
OW2
);
bool
need_src_copy_var
=
need_src_copy
(
kern_param
);
size_t
padding_group_size
=
IH2
*
IW2
;
bundle
.
set
(
kern_param
.
workspace_ptr
);
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
],
batch_id
=
ncb_index
.
ndrange_id
[
1
],
channel_id
=
ncb_index
.
ndrange_id
[
2
];
size_t
workspace_group_id
=
ncb_index
.
thread_id
;
const
int8_t
*
sptr
=
kern_param
.
src
<
int8_t
>
(
batch_id
,
group_id
,
channel_id
);
if
(
need_src_copy_var
)
{
int8_t
*
sptr_base
=
static_cast
<
int8_t
*>
(
bundle
.
get
(
0
))
+
workspace_group_id
*
padding_group_size
;
std
::
memset
(
sptr_base
,
0
,
sizeof
(
int8_t
)
*
IH2
*
IW2
);
rep
(
ih
,
std
::
min
(
IH
,
IH2
))
{
std
::
memcpy
(
sptr_base
+
(
ih
+
PH
)
*
IW2
+
PW
,
sptr
+
ih
*
IW
,
sizeof
(
int8_t
)
*
IW
);
}
}
};
}
// namespace x86
}
// namespace megdnn
// vim: syntax=cpp.doxygen
dnn/src/x86/conv_bias/int8/common_helper.h
浏览文件 @
3c32ad6d
...
...
@@ -6,13 +6,15 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include <immintrin.h>
#include "src/common/unroll_macro.h"
#include "megdnn/arch.h"
#include "src/common/unroll_macro.h"
#include "src/x86/conv_bias/int8/chanwise_helper.h"
#ifdef WIN32
#include <smmintrin.h>
#endif
...
...
dnn/src/x86/conv_bias/opr_impl.cpp
浏览文件 @
3c32ad6d
...
...
@@ -6,17 +6,18 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/x86/conv_bias/opr_impl.h"
#include <algorithm>
#include <memory>
#include "src/x86/matrix_mul/opr_impl.h"
#include "src/common/metahelper.h"
#include "src/common/opr_delegate.h"
#include "src/x86/conv_bias/f32/algos.h"
#include "src/x86/conv_bias/int8/algos.h"
#include "src/x86/matrix_mul/opr_impl.h"
using
namespace
megdnn
;
using
namespace
x86
;
...
...
@@ -69,6 +70,10 @@ void* ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::type() const {
return
x86_algo_type
;
}
void
*
ConvBiasImpl
::
AlgoChanWiseAvx2Stride2Qint8
::
type
()
const
{
return
x86_algo_type
;
}
class
ConvBiasImpl
::
AlgoPack
:
NonCopyableObj
{
AlgoDirect
stride1_direct_large_group
{
true
};
AlgoDirect
stride1_direct_small_group
{
false
};
...
...
@@ -77,6 +82,7 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
AlgoDirectAvx2Stride1Int8
avx2_stride1_direct_int8
;
AlgoAVX2DirectConvStride2
avx2_stride2_direct
;
AlgoChanWiseAvx2Stride1Qint8
avx2_stride1_chanwsie_qint8
;
AlgoChanWiseAvx2Stride2Qint8
avx2_stride2_chanwsie_qint8
;
AlgoMatrixMul
matmul
;
#if MEGDNN_X86_WITH_MKL_DNN
AlgoMkldnnMatmulQint8
mkldnn_matmul_qint8
;
...
...
@@ -85,6 +91,7 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
AlgoMkldnnConv
mkldnn_conv_fp32
;
#endif
SmallVector
<
std
::
unique_ptr
<
AlgoBase
>>
refhold
;
public:
AlgoPack
()
{
#if MEGDNN_X86_WITH_MKL_DNN
...
...
@@ -100,6 +107,7 @@ public:
all_algos
.
emplace_back
(
&
avx2_stride1_direct_int8
);
all_algos
.
emplace_back
(
&
avx2_stride2_direct
);
all_algos
.
emplace_back
(
&
avx2_stride1_chanwsie_qint8
);
all_algos
.
emplace_back
(
&
avx2_stride2_chanwsie_qint8
);
all_algos
.
emplace_back
(
&
matmul
);
static
CpuOprDelegationStorage
<>
storage
;
...
...
@@ -107,7 +115,8 @@ public:
auto
&&
matmul_algos
=
static_cast
<
MatrixMulImpl
*>
(
matmul_opr
)
->
algo_pack
();
for
(
auto
&&
algo
:
matmul_algos
)
{
if
(
algo
->
type
()
==
nullptr
)
continue
;
if
(
algo
->
type
()
==
nullptr
)
continue
;
for
(
uint32_t
tile_size
:
{
8
,
16
,
24
})
{
refhold
.
emplace_back
(
new
AlgoFP32WinogradF63_8x8
(
static_cast
<
fallback
::
MatrixMulImpl
::
AlgoBase
*>
(
algo
),
...
...
dnn/src/x86/conv_bias/opr_impl.h
浏览文件 @
3c32ad6d
...
...
@@ -6,7 +6,8 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
...
...
@@ -32,6 +33,7 @@ public:
class
AlgoDirectAvx2Stride1Int8
;
class
AlgoAVX2DirectConvStride2
;
class
AlgoChanWiseAvx2Stride1Qint8
;
class
AlgoChanWiseAvx2Stride2Qint8
;
#if MEGDNN_X86_WITH_MKL_DNN
class
AlgoMkldnnConv
;
class
AlgoMkldnnQint8
;
...
...
dnn/test/x86/conv_bias.cpp
浏览文件 @
3c32ad6d
...
...
@@ -6,7 +6,8 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/x86/utils.h"
#include "test/x86/fixture.h"
...
...
@@ -41,7 +42,8 @@ TEST_F(X86, CONV_BIAS_FORWARD) {
}
}
TEST_F
(
X86_MULTI_THREADS
,
AVX2_CHANWISE_DIRECT_STRIDE1_INT8x8x32
)
{
static
void
avx2_chanwise_direct_int8x8x32
(
Handle
*
handle
,
uint32_t
stride
,
const
char
*
algo
)
{
using
namespace
conv_bias
;
std
::
vector
<
TestArg
>
args
;
...
...
@@ -50,8 +52,8 @@ TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_INT8x8x32) {
if
(
w
+
2
*
p
<
kernel
||
h
+
2
*
p
<
kernel
)
return
;
param
::
ConvBias
param
;
param
.
stride_h
=
1
;
param
.
stride_w
=
1
;
param
.
stride_h
=
stride
;
param
.
stride_w
=
stride
;
param
.
pad_h
=
p
;
param
.
pad_w
=
p
;
param
.
nonlineMode
=
nonline_mode
;
...
...
@@ -74,7 +76,7 @@ TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_INT8x8x32) {
for
(
NonlineMode
nonline_mode
:
{
NonlineMode
::
IDENTITY
})
run
(
ic
,
w
,
h
,
kernel
,
pad
,
nonline_mode
);
Checker
<
ConvBias
>
checker
(
handle
()
);
Checker
<
ConvBias
>
checker
(
handle
);
UniformIntRNG
rng
{
-
50
,
50
};
checker
.
set_dtype
(
0
,
dtype
::
Int8
())
.
set_dtype
(
1
,
dtype
::
Int8
())
...
...
@@ -85,15 +87,25 @@ TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_INT8x8x32) {
.
set_rng
(
2
,
&
rng
)
.
set_epsilon
(
1e-3
);
checker
.
set_before_exec_callback
(
conv_bias
::
ConvBiasAlgoChecker
<
ConvBiasForward
>
(
"X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"
));
conv_bias
::
ConvBiasAlgoChecker
<
ConvBiasForward
>
(
algo
));
for
(
auto
&&
arg
:
args
)
{
checker
.
set_param
(
arg
.
param
).
exec
(
{
arg
.
src
,
arg
.
filter
,
arg
.
bias
,
{},
{}});
}
}
TEST_F
(
X86_MULTI_THREADS
,
AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS32
)
{
TEST_F
(
X86_MULTI_THREADS
,
AVX2_CHANWISE_DIRECT_STRIDE1_INT8x8x32
)
{
avx2_chanwise_direct_int8x8x32
(
handle
(),
1
,
"X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"
);
}
TEST_F
(
X86_MULTI_THREADS
,
AVX2_CHANWISE_DIRECT_STRIDE2_INT8x8x32
)
{
avx2_chanwise_direct_int8x8x32
(
handle
(),
2
,
"X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE2"
);
}
static
void
avx2_chanwise_direct_quantizeds32
(
Handle
*
handle
,
uint32_t
stride
,
const
char
*
algo
)
{
using
namespace
conv_bias
;
std
::
vector
<
TestArg
>
args
;
...
...
@@ -102,8 +114,8 @@ TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS32) {
if
(
w
+
2
*
p
<
kernel
||
h
+
2
*
p
<
kernel
)
return
;
param
::
ConvBias
param
;
param
.
stride_h
=
1
;
param
.
stride_w
=
1
;
param
.
stride_h
=
stride
;
param
.
stride_w
=
stride
;
param
.
pad_h
=
p
;
param
.
pad_w
=
p
;
param
.
nonlineMode
=
nonline_mode
;
...
...
@@ -126,7 +138,7 @@ TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS32) {
for
(
NonlineMode
nonline_mode
:
{
NonlineMode
::
IDENTITY
})
run
(
ic
,
w
,
h
,
kernel
,
pad
,
nonline_mode
);
Checker
<
ConvBias
>
checker
(
handle
()
);
Checker
<
ConvBias
>
checker
(
handle
);
UniformIntRNG
rng
{
-
50
,
50
};
checker
.
set_dtype
(
0
,
dtype
::
QuantizedS8
(
2.5
f
))
.
set_dtype
(
1
,
dtype
::
QuantizedS8
(
2.5
f
))
...
...
@@ -137,15 +149,26 @@ TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS32) {
.
set_rng
(
2
,
&
rng
)
.
set_epsilon
(
1e-3
);
checker
.
set_before_exec_callback
(
conv_bias
::
ConvBiasAlgoChecker
<
ConvBiasForward
>
(
"X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"
));
conv_bias
::
ConvBiasAlgoChecker
<
ConvBiasForward
>
(
algo
));
for
(
auto
&&
arg
:
args
)
{
checker
.
set_param
(
arg
.
param
).
exec
(
{
arg
.
src
,
arg
.
filter
,
arg
.
bias
,
{},
{}});
}
}
TEST_F
(
X86_MULTI_THREADS
,
AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS8x8x8
)
{
TEST_F
(
X86_MULTI_THREADS
,
AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS32
)
{
avx2_chanwise_direct_quantizeds32
(
handle
(),
1
,
"X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"
);
}
TEST_F
(
X86_MULTI_THREADS
,
AVX2_CHANWISE_DIRECT_STRIDE2_QuantizedS32
)
{
avx2_chanwise_direct_quantizeds32
(
handle
(),
2
,
"X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE2"
);
}
static
void
avx2_chanwise_direct_quantizeds8x8x8
(
Handle
*
handle
,
uint32_t
stride
,
const
char
*
algo
)
{
using
namespace
conv_bias
;
std
::
vector
<
TestArg
>
args
;
...
...
@@ -154,8 +177,8 @@ TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS8x8x8) {
if
(
w
+
2
*
p
<
kernel
||
h
+
2
*
p
<
kernel
)
return
;
param
::
ConvBias
param
;
param
.
stride_h
=
1
;
param
.
stride_w
=
1
;
param
.
stride_h
=
stride
;
param
.
stride_w
=
stride
;
param
.
pad_h
=
p
;
param
.
pad_w
=
p
;
param
.
nonlineMode
=
nonline_mode
;
...
...
@@ -180,7 +203,7 @@ TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS8x8x8) {
NonlineMode
::
RELU
})
run
(
ic
,
w
,
h
,
kernel
,
pad
,
nonline_mode
);
Checker
<
ConvBias
>
checker
(
handle
()
);
Checker
<
ConvBias
>
checker
(
handle
);
UniformIntRNG
rng
{
-
50
,
50
};
checker
.
set_dtype
(
0
,
dtype
::
QuantizedS8
(
2.5
f
))
.
set_dtype
(
1
,
dtype
::
QuantizedS8
(
2.5
f
))
...
...
@@ -191,14 +214,23 @@ TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS8x8x8) {
.
set_rng
(
2
,
&
rng
)
.
set_epsilon
(
1e-3
);
checker
.
set_before_exec_callback
(
conv_bias
::
ConvBiasAlgoChecker
<
ConvBiasForward
>
(
"X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"
));
conv_bias
::
ConvBiasAlgoChecker
<
ConvBiasForward
>
(
algo
));
for
(
auto
&&
arg
:
args
)
{
checker
.
set_param
(
arg
.
param
).
exec
(
{
arg
.
src
,
arg
.
filter
,
arg
.
bias
,
{},
{}});
}
}
TEST_F
(
X86_MULTI_THREADS
,
AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS8x8x8
)
{
avx2_chanwise_direct_quantizeds8x8x8
(
handle
(),
1
,
"X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"
);
}
TEST_F
(
X86_MULTI_THREADS
,
AVX2_CHANWISE_DIRECT_STRIDE2_QuantizedS8x8x8
)
{
avx2_chanwise_direct_quantizeds8x8x8
(
handle
(),
2
,
"X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE2"
);
}
TEST_F
(
X86_MULTI_THREADS
,
AVX2_CONV_BIAS_DIRECT_STRIDE1_INT8x8x32
)
{
using
namespace
conv_bias
;
std
::
vector
<
TestArg
>
args
;
...
...
@@ -343,7 +375,6 @@ TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE1_S8S8S8) {
args
.
emplace_back
(
param
,
TensorShape
{
2
,
2
*
ic
,
h
,
w
},
TensorShape
{
2
,
oc
/
2
,
ic
,
kernel
,
kernel
},
TensorShape
{
1
,
oc
,
1
,
1
});
};
for
(
size_t
kernel
:
{
2
,
3
,
5
,
7
})
...
...
@@ -967,8 +998,8 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_INT8X8X32) {
#if MEGDNN_X86_WITH_MKL_DNN
if
(
x86
::
is_supported
(
x86
::
SIMDType
::
VNNI
))
{
checker_conv_bias
(
args
,
handle
(),
&
rng
,
epsilon
,
dtype
::
Int8
{},
dtype
::
Int8
{},
dtype
::
Int32
{},
dtype
::
Int32
{},
"CONV1x1:X86_INT8X8X32_MKLDNN:24"
);
dtype
::
Int8
{},
dtype
::
Int32
{},
dtype
::
Int32
{},
"CONV1x1:X86_INT8X8X32_MKLDNN:24"
);
}
#endif
#if MEGDNN_X86_WITH_VNNI
...
...
@@ -983,8 +1014,8 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_INT8X8X32) {
dtype
::
Int8
{},
dtype
::
Int32
{},
dtype
::
Int32
{},
"CONV1x1:X86_INT8X8X32_AVX2_4X16X2:24"
);
checker_conv_bias
(
args
,
handle
(),
&
rng
,
epsilon
,
dtype
::
Int8
{},
dtype
::
Int8
{},
dtype
::
Int32
{},
dtype
::
Int32
{},
"CONV1x1:X86_INT8X8X32_AVX2_2X4X16:24"
);
dtype
::
Int8
{},
dtype
::
Int32
{},
dtype
::
Int32
{},
"CONV1x1:X86_INT8X8X32_AVX2_2X4X16:24"
);
}
checker_conv_bias
(
args
,
handle
(),
&
rng
,
epsilon
,
dtype
::
Int8
{},
dtype
::
Int8
{},
dtype
::
Int32
{},
dtype
::
Int32
{},
...
...
@@ -1231,7 +1262,7 @@ TEST_F(X86_MULTI_THREADS, BENCHMARK_CONVBIAS_FP32_MKLDNN) {
#endif
/************************* Winograd ****************************/
namespace
{
namespace
{
std
::
vector
<
conv_bias
::
TestArg
>
get_winograd_mk_nchw88_args
()
{
std
::
vector
<
conv_bias
::
TestArg
>
args
;
param
::
ConvBias
cur_param
;
...
...
@@ -1265,17 +1296,17 @@ std::vector<conv_bias::TestArg> get_winograd_mk_nchw88_args() {
TensorShape{2, oc, ic, 3, 3, 8, 8},
TensorShape{1, 2 * oc, 1, 1, 8});*/
}}}
// clang-format on
//! test for multi-thread OC parallel
cur_param
.
sparse
=
param
::
ConvBias
::
Sparse
::
DENSE
;
cur_param
.
pad_h
=
cur_param
.
pad_w
=
1
;
args
.
emplace_back
(
cur_param
,
TensorShape
{
2
,
1
,
9
,
9
,
8
},
TensorShape
{
128
,
1
,
3
,
3
,
8
,
8
},
TensorShape
{
1
,
128
,
1
,
1
,
8
});
/*cur_param.sparse = param::ConvBias::Sparse::GROUP;
args.emplace_back(cur_param, TensorShape{2, 2, 9, 9, 8},
TensorShape{2, 128, 1, 3, 3, 8, 8},
TensorShape{1, 2 * 128, 1, 1, 8});*/
// clang-format on
//! test for multi-thread OC parallel
cur_param
.
sparse
=
param
::
ConvBias
::
Sparse
::
DENSE
;
cur_param
.
pad_h
=
cur_param
.
pad_w
=
1
;
args
.
emplace_back
(
cur_param
,
TensorShape
{
2
,
1
,
9
,
9
,
8
},
TensorShape
{
128
,
1
,
3
,
3
,
8
,
8
},
TensorShape
{
1
,
128
,
1
,
1
,
8
});
/*cur_param.sparse = param::ConvBias::Sparse::GROUP;
args.emplace_back(cur_param, TensorShape{2, 2, 9, 9, 8},
TensorShape{2, 128, 1, 3, 3, 8, 8},
TensorShape{1, 2 * 128, 1, 1, 8});*/
}
return
args
;
}
...
...
@@ -1329,7 +1360,8 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_WINOGRAD_WEIGHT_PREPROCESS) {
auto
conv_bias_opr
=
handle
->
create_operator
<
ConvBias
>
();
conv_bias_opr
->
param
()
=
param
;
conv_bias_opr
->
param
().
format
=
param
::
ConvBias
::
Format
::
NCHW88_WINOGRAD
;
conv_bias_opr
->
param
().
format
=
param
::
ConvBias
::
Format
::
NCHW88_WINOGRAD
;
conv_bias_opr
->
param
().
output_block_size
=
m
;
size_t
conv_bias_workspace_in_bytes
=
conv_bias_opr
->
get_workspace_in_bytes
(
...
...
@@ -1720,17 +1752,16 @@ void benchmark_impl(const param::ConvBias param,
}
}
void
benchmark_impl_comp
(
const
param
::
ConvBias
param
,
std
::
vector
<
std
::
pair
<
SmallVector
<
TensorShape
>
,
float
>>&
shapes_and_computation
,
const
std
::
string
algo_name
,
const
std
::
string
algo_name1
,
size_t
RUNS
,
TaskExecutorConfig
&&
multi_thread_config
,
TaskExecutorConfig
&&
single_thread_config
,
std
::
vector
<
DType
>
dtype_v
)
{
void
benchmark_impl_comp
(
const
param
::
ConvBias
param
,
std
::
vector
<
std
::
pair
<
SmallVector
<
TensorShape
>
,
float
>>&
shapes_and_computation
,
const
std
::
string
algo_name
,
const
std
::
string
algo_name1
,
size_t
RUNS
,
TaskExecutorConfig
&&
multi_thread_config
,
TaskExecutorConfig
&&
single_thread_config
,
std
::
vector
<
DType
>
dtype_v
)
{
std
::
vector
<
DType
>
data_type
=
{
dtype
::
Float32
(),
dtype
::
Float32
(),
dtype
::
Float32
(),
dtype
::
Float32
()};
std
::
vector
<
float
>
multi_thread_times
,
single_thread_times
;
{
auto
multi_thread_hanle
=
...
...
@@ -1738,10 +1769,10 @@ void benchmark_impl_comp(const param::ConvBias param,
auto
benchmarker
=
Benchmarker
<
ConvBias
>
(
multi_thread_hanle
.
get
());
benchmarker
.
set_times
(
RUNS
)
.
set_display
(
false
)
.
set_dtype
(
0
,
dtype_v
[
0
])
.
set_dtype
(
1
,
dtype_v
[
1
])
.
set_dtype
(
2
,
dtype_v
[
2
])
.
set_dtype
(
4
,
dtype_v
[
3
])
.
set_dtype
(
0
,
dtype_v
[
0
])
.
set_dtype
(
1
,
dtype_v
[
1
])
.
set_dtype
(
2
,
dtype_v
[
2
])
.
set_dtype
(
4
,
dtype_v
[
3
])
.
set_param
(
param
)
.
set_before_exec_callback
(
conv_bias
::
ConvBiasAlgoChecker
<
ConvBias
>
(
...
...
@@ -1756,10 +1787,10 @@ void benchmark_impl_comp(const param::ConvBias param,
auto
benchmarker
=
Benchmarker
<
ConvBias
>
(
single_thread_handle
.
get
());
benchmarker
.
set_times
(
RUNS
)
.
set_display
(
false
)
.
set_dtype
(
0
,
dtype_v
[
0
])
.
set_dtype
(
1
,
dtype_v
[
1
])
.
set_dtype
(
2
,
dtype_v
[
2
])
.
set_dtype
(
4
,
dtype_v
[
3
])
.
set_dtype
(
0
,
dtype_v
[
0
])
.
set_dtype
(
1
,
dtype_v
[
1
])
.
set_dtype
(
2
,
dtype_v
[
2
])
.
set_dtype
(
4
,
dtype_v
[
3
])
.
set_param
(
param
)
.
set_before_exec_callback
(
conv_bias
::
ConvBiasAlgoChecker
<
ConvBias
>
(
...
...
@@ -1789,11 +1820,13 @@ void benchmark_impl_comp(const param::ConvBias param,
}
}
// namespace
TEST_F
(
X86_BENCHMARK_MULTI_THREADS
,
BENCHMARK_CONVBIAS_CHANWISE_AVX2_INT8
)
{
static
void
benchmark_convbias_chanwise_avx2_int8
(
uint32_t
stride
,
const
char
*
algo
)
{
constexpr
size_t
RUNS
=
50
;
param
::
ConvBias
param
;
param
.
stride_h
=
1
;
param
.
stride_w
=
1
;
param
.
stride_h
=
stride
;
param
.
stride_w
=
stride
;
param
.
sparse
=
param
::
ConvBias
::
Sparse
::
GROUP
;
std
::
vector
<
DType
>
data_type
=
{
dtype
::
Int8
(),
dtype
::
Int8
(),
...
...
@@ -1841,14 +1874,23 @@ TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_CHANWISE_AVX2_INT8) {
bench_case
(
1
,
576
,
14
,
14
,
2
);
bench_case
(
1
,
960
,
7
,
7
,
2
);
std
::
string
algo_name
=
"X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"
;
printf
(
"Benchmark
X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1
\n
"
);
std
::
string
algo_name
=
algo
;
printf
(
"Benchmark
%s
\n
"
,
algo
);
benchmark_impl
(
param
,
shapes_and_computation
,
algo_name
,
RUNS
,
{
4
,
{
4
,
5
,
6
,
7
}},
{
1
,
{
4
}},
data_type
);
benchmark_impl
(
param
,
shapes_and_computation
,
algo_name
,
RUNS
,
{
2
,
{
4
,
5
}},
{
1
,
{
4
}},
data_type
);
shapes_and_computation
.
clear
();
}
TEST_F
(
X86_BENCHMARK_MULTI_THREADS
,
BENCHMARK_CONVBIAS_CHANWISE_AVX2_INT8_S1
)
{
benchmark_convbias_chanwise_avx2_int8
(
1
,
"X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"
);
}
TEST_F
(
X86_BENCHMARK_MULTI_THREADS
,
BENCHMARK_CONVBIAS_CHANWISE_AVX2_INT8_S2
)
{
benchmark_convbias_chanwise_avx2_int8
(
2
,
"X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE2"
);
}
TEST_F
(
X86_BENCHMARK_MULTI_THREADS
,
BENCHMARK_CONVBIAS_DIRECT_AVX2_INT8
)
{
constexpr
size_t
RUNS
=
50
;
...
...
@@ -2129,7 +2171,8 @@ TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32) {
shapes_and_computation
.
clear
();
}
TEST_F
(
X86_BENCHMARK_MULTI_THREADS
,
BENCHMARK_CONVBIAS_IM2COL_F32_single_thread
)
{
TEST_F
(
X86_BENCHMARK_MULTI_THREADS
,
BENCHMARK_CONVBIAS_IM2COL_F32_single_thread
)
{
constexpr
size_t
RUNS
=
50
;
param
::
ConvBias
param
;
...
...
@@ -2143,9 +2186,8 @@ TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32_single_thread)
dtype
::
Float32
(),
dtype
::
Float32
()};
std
::
vector
<
std
::
pair
<
SmallVector
<
TensorShape
>
,
float
>>
shapes_and_computation
;
auto
bench_case
=
[
&
](
size_t
N
,
size_t
IC
,
size_t
OC
,
size_t
H
,
size_t
W
,
size_t
FS
,
size_t
group
)
{
auto
bench_case
=
[
&
](
size_t
N
,
size_t
IC
,
size_t
OC
,
size_t
H
,
size_t
W
,
size_t
FS
,
size_t
group
)
{
SmallVector
<
TensorShape
>
shapes
{{
N
,
IC
,
H
,
W
},
{
OC
/
group
,
IC
/
group
,
FS
,
FS
},
{
1
,
OC
,
1
,
1
},
...
...
@@ -2167,7 +2209,7 @@ TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32_single_thread)
bench_case
(
1
,
32
,
32
,
100
,
100
,
3
,
1
);
bench_case
(
1
,
32
,
32
,
80
,
80
,
3
,
1
);
bench_case
(
1
,
32
,
32
,
80
,
80
,
3
,
1
);
bench_case
(
1
,
64
,
32
,
7
,
7
,
3
,
1
);
bench_case
(
1
,
64
,
64
,
7
,
7
,
3
,
1
);
bench_case
(
1
,
64
,
128
,
7
,
7
,
3
,
1
);
...
...
@@ -2192,10 +2234,10 @@ TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32_single_thread)
std
::
string
algo_name
=
"IM2COLMATMUL:X86_F32_MKL_PACKA:192"
;
std
::
string
algo_name1
=
"IM2COLMATMUL:X86_F32_BLAS:192"
;
printf
(
"Benchmark IM2COLMATMUL:X86_F32_BLAS algo
\n
"
);
benchmark_impl_comp
(
param
,
shapes_and_computation
,
algo_name
,
algo_name1
,
RUNS
,
{
1
,
{
4
}},
{
1
,
{
4
}},
data_type
);
benchmark_impl_comp
(
param
,
shapes_and_computation
,
algo_name
,
algo_name1
,
RUNS
,
{
1
,
{
7
}},
{
1
,
{
7
}},
data_type
);
benchmark_impl_comp
(
param
,
shapes_and_computation
,
algo_name
,
algo_name1
,
RUNS
,
{
1
,
{
4
}},
{
1
,
{
4
}},
data_type
);
benchmark_impl_comp
(
param
,
shapes_and_computation
,
algo_name
,
algo_name1
,
RUNS
,
{
1
,
{
7
}},
{
1
,
{
7
}},
data_type
);
shapes_and_computation
.
clear
();
}
...
...
@@ -2269,7 +2311,7 @@ TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_INT8X8X32) {
shapes_and_computation
.
clear
();
}
namespace
{
namespace
{
std
::
vector
<
conv_bias
::
TestArg
>
get_winograd_benchmark_args
(
size_t
kernel
,
size_t
pack_size
)
{
std
::
vector
<
conv_bias
::
TestArg
>
args
;
...
...
@@ -2290,14 +2332,14 @@ std::vector<conv_bias::TestArg> get_winograd_benchmark_args(size_t kernel,
param
.
pad_h
=
p
;
param
.
pad_w
=
p
;
args
.
push_back
(
conv_bias
::
TestArg
{
param
,
TensorShape
{
1
,
ic
/
8
,
h
,
w
,
8
}
,
TensorShape
{
oc
/
8
,
ic
/
8
,
kernel
,
kernel
,
8
,
8
},
{
1
,
oc
/
8
,
1
,
1
,
8
}});
args
.
push_back
(
conv_bias
::
TestArg
{
param
,
TensorShape
{
1
,
ic
/
8
,
h
,
w
,
8
},
TensorShape
{
oc
/
8
,
ic
/
8
,
kernel
,
kernel
,
8
,
8
},
{
1
,
oc
/
8
,
1
,
1
,
8
}});
};
for
(
size_t
ic
:
{
64
,
128
,
256
})
{
for
(
size_t
oc
:
{
64
,
128
,
256
})
{
for
(
size_t
oc
:
{
64
,
128
,
256
})
{
pack
(
oc
,
ic
,
56
,
56
,
kernel
,
kernel
/
2
);
pack
(
oc
,
ic
,
14
,
14
,
kernel
,
kernel
/
2
);
pack
(
oc
,
ic
,
28
,
28
,
kernel
,
kernel
/
2
);
...
...
@@ -2317,8 +2359,8 @@ std::vector<conv_bias::TestArg> get_winograd_benchmark_args(size_t kernel,
return
args
;
}
void
benchmark_winograd
(
const
char
*
algo_name
,
Handle
*
handle
,
size_t
kernel
,
size_t
pack_size
)
{
void
benchmark_winograd
(
const
char
*
algo_name
,
Handle
*
handle
,
size_t
kernel
,
size_t
pack_size
)
{
auto
&&
args
=
get_winograd_benchmark_args
(
kernel
,
pack_size
);
using
namespace
conv_bias
;
constexpr
size_t
RUN
=
10
;
...
...
@@ -2361,7 +2403,7 @@ void benchmark_winograd(const char* algo_name, Handle* handle,
computations
/
used_winograd
,
used
/
used_winograd
);
}
}
}
}
// namespace
TEST_F
(
X86
,
BENCHMARK_CONVBIAS_WINOGRAD_F63_8x8
)
{
benchmark_winograd
(
"WINOGRAD:X86_F32MK8_8X8:8:6:8"
,
handle
(),
3
,
8
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录