Commit 90ca8554
Authored on Apr 09, 2020 by Megvii Engine Team
feat(dnn/x86): add avx2 int8 stride1 chanwise multithread conv
GitOrigin-RevId: 8f310c3d139dfc27a4083f354597363681b73ba5
Parent: 0bdb64c5
14 changed files with 2271 additions and 2 deletions (+2271, -2)
dnn/src/common/unroll_macro.h                               +9    -0
dnn/src/x86/conv_bias/int8/algos.cpp                        +60   -0
dnn/src/x86/conv_bias/int8/algos.h                          +23   -0
dnn/src/x86/conv_bias/int8/avx2_chanwise_kern.cpp           +1593 -0
dnn/src/x86/conv_bias/int8/avx2_chanwise_kern.h             +39   -0
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.cpp        +251  -0
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.h          +42   -0
dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.cpp     +0    -1
dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.cpp     +0    -1
dnn/src/x86/conv_bias/int8/common_helper.h                  +1    -0
dnn/src/x86/conv_bias/opr_impl.cpp                          +6    -0
dnn/src/x86/conv_bias/opr_impl.h                            +1    -0
dnn/src/x86/elemwise_helper/kimpl/typecvt.h                 +26   -0
dnn/test/x86/conv_bias.cpp                                  +220  -0
dnn/src/common/unroll_macro.h
@@ -40,6 +40,15 @@
UNROLL_RAW16(cb, v0, ##a) \
cb(16, ##a) cb(17, ##a) cb(18, ##a) cb(19, ##a) cb(20, ##a) cb(21, ##a) \
cb(22, ##a) cb(23, ##a)
#define UNROLL_RAW25(cb, v0, a...) \
UNROLL_RAW24(cb, v0, ##a) \
cb(24, ##a)
#define UNROLL_RAW49(cb, v0, a...) \
UNROLL_RAW25(cb, v0, ##a) \
cb(25, ##a) cb(26, ##a) cb(27, ##a) cb(28, ##a) cb(29, ##a) cb(30, ##a) \
cb(31, ##a) cb(32, ##a) cb(33, ##a) cb(34, ##a) cb(35, ##a) cb(36, ##a) \
cb(37, ##a) cb(38, ##a) cb(39, ##a) cb(40, ##a) cb(41, ##a) cb(42, ##a) \
cb(43, ##a) cb(44, ##a) cb(45, ##a) cb(46, ##a) cb(47, ##a) cb(48, ##a)
#define UNROLL_CALL0(step, cb, v...) UNROLL_RAW##step(cb, 0, ##v)
#define UNROLL_CALL1(step, cb, v...) UNROLL_CALL0(step, cb, ##v)
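The two new widths line up with the filter sizes added in this commit: a 5x5 kernel has 25 taps and a 7x7 kernel has 49, so their tap loops can be fully unrolled. As a minimal sketch of how such a macro is typically invoked (the ACC callback and the src/flt/sum variables are hypothetical, not part of this diff):

    // UNROLL_CALL0(25, ACC) pastes ACC(0) ACC(1) ... ACC(24) at compile
    // time, yielding a fully unrolled loop with constant indices.
    #define ACC(i) sum += int32_t(src[i]) * int32_t(flt[i]);
    int32_t sum = 0;
    UNROLL_CALL0(25, ACC)
    #undef ACC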
dnn/src/x86/conv_bias/int8/algos.cpp
@@ -15,6 +15,7 @@
#include "src/fallback/convolution/img2col_helper.h"
#include "src/x86/conv_bias/int8/avx2_direct_conv_stride1.h"
#include "src/x86/conv_bias/int8/avx2_direct_conv_stride2.h"
#include "src/x86/conv_bias/int8/avx2_chanwise_stride1.h"
#include "src/x86/conv_bias/opr_impl.h"
#include "src/x86/conv_bias/postprocess_helper.h"
#include "src/x86/handle.h"
@@ -31,6 +32,65 @@ using namespace dnnl;
using namespace megdnn;
using namespace x86;

bool ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::usable(
        FallbackConvBiasImpl* /*opr*/, const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    auto&& fm = param.filter_meta;
    auto FH = fm.spatial[0];
    bool available =
            ((param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
              param.filter_type.enumv() == DTypeEnum::QuantizedS8 &&
              param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
             (((param.src_type.enumv() == DTypeEnum::Int8 &&
                param.filter_type.enumv() == DTypeEnum::Int8 &&
                param.dst_type.enumv() == DTypeEnum::Int32) ||
               (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
                param.filter_type.enumv() == DTypeEnum::QuantizedS8 &&
                param.dst_type.enumv() == DTypeEnum::QuantizedS32)))) &&
            fm.format == Param::Format::NCHW && fm.spatial_ndim == 2 &&
            fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
            (FH == 2 || FH == 3 || FH == 5 || FH == 7) && fm.stride[0] == 1 &&
            fm.stride[1] == 1 && (fm.icpg == 1) && (fm.ocpg == 1) &&
            is_supported(SIMDType::AVX2);
    return available;
}

WorkspaceBundle ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::get_bundle(
        const NCBKernSizeParam& param) {
    size_t nr_threads = param.nr_threads;
    size_t IH2, IW2, OH2, OW2;
    size_t src_size = 0, dst_size = 0, int32_temp = 0;

    avx2_chanwise_stride1::get_rectified_size(param, IH2, IW2, OH2, OW2);

    if (avx2_chanwise_stride1::need_src_copy(param)) {
        src_size = IH2 * IW2 * sizeof(int8_t) * nr_threads;
    }
    if (avx2_chanwise_stride1::need_dst_copy(param)) {
        dst_size = OH2 * OW2 * param.dst_type.size() * nr_threads;
    }
    bool dst_need_convert = param.dst_type.enumv() == DTypeEnum::QuantizedS8;

    if (dst_need_convert) {
        int32_temp = OH2 * OW2 * sizeof(int32_t) * nr_threads;
    }
    return dst_need_convert
                   ? WorkspaceBundle(nullptr, {src_size, dst_size, int32_temp})
                   : WorkspaceBundle(nullptr, {src_size, dst_size});
}

size_t ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::get_workspace(
        FallbackConvBiasImpl*, const NCBKernSizeParam& param) const {
    return get_bundle(param).total_size_in_bytes();
}

SmallVector<fallback::ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::get_kimpls(
        const NCBKernSizeParam& param) const {
    auto bundle = get_bundle(param);
    return avx2_chanwise_stride1::get_kimpls(param, bundle);
}

bool ConvBiasImpl::AlgoDirectAvx2Stride1Int8::usable(
        FallbackConvBiasImpl* /*opr*/, const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
...
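To make the workspace arithmetic in get_bundle concrete, a worked example with hypothetical sizes (not taken from this diff):

    // IH2 = IW2 = 32, OH2 = OW2 = 32, nr_threads = 4, and
    // dst_type = QuantizedS8, so dst_type.size() == 1 and dst_need_convert:
    //   src_size   = 32 * 32 * sizeof(int8_t)  * 4 =  4096 bytes
    //   dst_size   = 32 * 32 * 1               * 4 =  4096 bytes
    //   int32_temp = 32 * 32 * sizeof(int32_t) * 4 = 16384 bytes
    // => WorkspaceBundle(nullptr, {4096, 4096, 16384}); each kernel then
    //    offsets into its own slice by thread_id, as the .cpp below shows.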
dnn/src/x86/conv_bias/int8/algos.h
@@ -13,6 +13,29 @@
namespace megdnn {
namespace x86 {

/* ===================== avx2 stride1 chanwise algo ===================== */
class ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8 final : public AlgoBase {
    SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
    static WorkspaceBundle get_bundle(const NCBKernSizeParam& param);

public:
    bool is_reproducible() const override { return true; }
    const char* name() const override {
        return "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1";
    }
    bool usable(FallbackConvBiasImpl* opr, const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const override;
    size_t get_workspace(FallbackConvBiasImpl* opr,
                         const NCBKernSizeParam& param) const override;
    virtual SmallVector<NCBKern> dispatch_kerns(
            fallback::ConvBiasImpl*,
            const NCBKernSizeParam& param) const override {
        return get_kimpls(param);
    }
    void* type() const override;
};

/* ===================== avx2 stride1 direct algo ===================== */
class ConvBiasImpl::AlgoDirectAvx2Stride1Int8 final : public AlgoBase {
    SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;
...
dnn/src/x86/conv_bias/int8/avx2_chanwise_kern.cpp
new file mode 100644
This diff is collapsed (1593 insertions; see the file list above).
dnn/src/x86/conv_bias/int8/avx2_chanwise_kern.h
new file mode 100644
/**
 * \file src/x86/conv_bias/int8/avx2_chanwise_kern.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#pragma once
#include "src/x86/conv_bias/opr_impl.h"
namespace megdnn {
namespace x86 {
namespace avx2_chanwise_stride1 {

#define KERN(stride, i) \
template <BiasMode bias_mode, bool is_quantized, typename Op> \
MEGDNN_ATTRIBUTE_TARGET("avx2") \
void avx2_chanwise_direct_##stride##_##i##x##i##_int8( \
const int8_t* src, const int8_t* filter, const int32_t* bias, \
int32_t* temp, int8_t* dst, const size_t IH, const size_t IW, \
const size_t OH, const size_t OW, const Op& op);
KERN(stride1, 2)
KERN(stride1, 3)
KERN(stride1, 5)
KERN(stride1, 7)

#undef KERN
}  // namespace avx2_chanwise_stride1
}  // namespace x86
}  // namespace megdnn

// vim: syntax=cpp.doxygen
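For reference, expanding KERN(stride1, 3) by hand gives the declaration below; the ## token pasting fills in the stride tag and the filter size:

    template <BiasMode bias_mode, bool is_quantized, typename Op>
    MEGDNN_ATTRIBUTE_TARGET("avx2")
    void avx2_chanwise_direct_stride1_3x3_int8(
            const int8_t* src, const int8_t* filter, const int32_t* bias,
            int32_t* temp, int8_t* dst, const size_t IH, const size_t IW,
            const size_t OH, const size_t OW, const Op& op);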
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.cpp
new file mode 100644
/**
 * \file src/x86/conv_bias/int8/avx2_chanwise_stride1.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "src/x86/conv_bias/int8/avx2_chanwise_stride1.h"
#include "src/x86/conv_bias/int8/avx2_chanwise_kern.h"
#include "src/x86/elemwise_op.h"
namespace megdnn {
namespace x86 {
namespace avx2_chanwise_stride1 {

bool need_dst_copy(const NCBKernSizeParam& param) {
    return param.osz[1] % 16;
}

bool need_src_copy(const NCBKernSizeParam& param) {
    auto&& fm = param.filter_meta;
    return (fm.padding[0] != 0 || fm.padding[1] != 0) ? true
                                                      : need_dst_copy(param);
}

void get_rectified_size(const NCBKernSizeParam& param, size_t& IH2,
                        size_t& IW2, size_t& OH2, size_t& OW2) {
    auto&& fm = param.filter_meta;
    auto SW = fm.stride[1];
    auto OH = param.osz[0];
    auto OW = param.osz[1];
    auto FH = fm.spatial[0];
    auto FW = fm.spatial[1];

    OH2 = OH;
    OW2 = (OW + 15) & ~15;
    IH2 = SW * OH + FH - SW;
    IW2 = SW * OW2 + FW - SW;
}

void copy_padding_kern(WorkspaceBundle bundle,
                       const ConvBiasImpl::NCBKernParam& kern_param,
                       const ConvBiasImpl::NCBKernIndex& ncb_index) {
    size_t IH = kern_param.isz[0];
    size_t IW = kern_param.isz[1];
    size_t PH = kern_param.filter_meta.padding[0];
    size_t PW = kern_param.filter_meta.padding[1];

    size_t IH2, IW2, OH2, OW2;
    get_rectified_size(kern_param, IH2, IW2, OH2, OW2);
    bool need_src_copy_var = need_src_copy(kern_param);
    size_t padding_group_size = IH2 * IW2;
    bundle.set(kern_param.workspace_ptr);

    size_t group_id = ncb_index.ndrange_id[0],
           batch_id = ncb_index.ndrange_id[1],
           channel_id = ncb_index.ndrange_id[2];
    size_t workspace_group_id = ncb_index.thread_id;
    const int8_t* sptr = kern_param.src<int8_t>(batch_id, group_id, channel_id);
    if (need_src_copy_var) {
        int8_t* sptr_base = static_cast<int8_t*>(bundle.get(0)) +
                            workspace_group_id * padding_group_size;
        std::memset(sptr_base, 0, sizeof(int8_t) * IH2 * IW2);
        rep(ih, IH) {
            std::memcpy(sptr_base + (ih + PH) * IW2 + PW, sptr + ih * IW,
                        sizeof(int8_t) * IW);
        }
    }
};

template <size_t filter, BiasMode bias_mode, bool is_quantized, typename Op>
void conv_kimpl(WorkspaceBundle bundle, const NCBKernParam& kern_param,
                const NCBKernIndex& ncb_index) {
    size_t OH = kern_param.osz[0];
    size_t OW = kern_param.osz[1];
    size_t IH2, IW2, OH2, OW2;
    get_rectified_size(kern_param, IH2, IW2, OH2, OW2);
    bool need_src_copy_var = need_src_copy(kern_param);
    bool need_dst_copy_var = need_dst_copy(kern_param);
    bool need_post_process =
            kern_param.dst_type.enumv() == DTypeEnum::QuantizedS8;

    Op op = Op(1.0f, 4.0f);
    if (need_post_process) {
        float scale_bias =
                kern_param.bias_type.param<dtype::QuantizedS32>().scale;
        float scale_dst = kern_param.dst_type.param<dtype::QuantizedS8>().scale;
        op = Op(scale_bias, scale_dst);
    }
    size_t padding_group_size = IH2 * IW2;
    bundle.set(kern_param.workspace_ptr);

    size_t workspace_group_id = ncb_index.thread_id;
    size_t group_id = ncb_index.ndrange_id[0],
           batch_id = ncb_index.ndrange_id[1];
    const int8_t* sptr = kern_param.src<dt_int8>(batch_id, group_id);
    const int8_t* fptr = kern_param.filter<dt_int8>(group_id);
    void* dst = kern_param.dst<void>(batch_id, group_id);
    const int32_t* bptr = kern_param.bias<dt_int32>(batch_id, group_id);
    if (need_src_copy_var) {
        sptr = static_cast<int8_t*>(bundle.get(0)) +
               workspace_group_id * padding_group_size;
    }
    void* dptr = nullptr;
    int32_t* tptr = nullptr;
    if (need_dst_copy_var) {
        dptr = reinterpret_cast<void*>(
                reinterpret_cast<ptrdiff_t>(bundle.get(1)) +
                ncb_index.thread_id * OH2 * OW2 * kern_param.dst_type.size());
    } else {
        dptr = dst;
    }
#define KERN_NEED_POST_PROCESS(filter) \
avx2_chanwise_direct_stride1_##filter##x##filter##_int8<bias_mode, true, \
Op>( \
sptr, fptr, bptr, tptr, static_cast<int8_t*>(dptr), IH2, IW2, OH2, \
OW2, op)
#define KERN_NO_POST_PROCESS(filter) \
avx2_chanwise_direct_stride1_##filter##x##filter##_int8<bias_mode, false, \
Op>( \
sptr, fptr, bptr, static_cast<int32_t*>(dptr), nullptr, IH2, IW2, \
OH2, OW2, op)
    if (need_post_process) {
        tptr = static_cast<int32_t*>(bundle.get(2)) +
               ncb_index.thread_id * OH2 * OW2 * kern_param.dst_type.size();
        DISPATCH_FILTER(filter, KERN_NEED_POST_PROCESS)
    } else {
        DISPATCH_FILTER(filter, KERN_NO_POST_PROCESS)
    }
#undef KERN_NEED_POST_PROCESS
#undef KERN_NO_POST_PROCESS
    if (need_dst_copy_var) {
        rep(oh, OH) {
            std::memcpy(reinterpret_cast<void*>(
                                reinterpret_cast<ptrdiff_t>(dst) +
                                oh * OW * kern_param.dst_type.size()),
                        reinterpret_cast<void*>(
                                reinterpret_cast<ptrdiff_t>(dptr) +
                                oh * OW2 * kern_param.dst_type.size()),
                        kern_param.dst_type.size() * OW);
        }
    }
};

SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& kern_param,
                                WorkspaceBundle bundle) {
    MEGDNN_MARK_USED_VAR(kern_param);
    auto fm = kern_param.filter_meta;
    size_t group = fm.group;
    size_t n = kern_param.n;
    SmallVector<NCBKern> ncb_kerns;
    conv_fun do_conv_fun = nullptr;
#define DO_CONV_KERN_FUN(filter, bias_mode, is_quantized, op) \
do_conv_fun = conv_kimpl<filter, bias_mode, is_quantized, op>;
#define GET_OP_PARAM(i, bias_mode, is_quantized) \
switch (kern_param.nonlineMode) { \
case param::ConvBias::NonlineMode::IDENTITY: \
DO_CONV_KERN_FUN(i, bias_mode, is_quantized, \
TypeCvtOp<SIMDType::AVX2 MEGDNN_COMMA dt_qint32 \
MEGDNN_COMMA dt_qint8>) \
break; \
case param::ConvBias::NonlineMode::RELU: \
DO_CONV_KERN_FUN(i, bias_mode, is_quantized, \
ReluOp<SIMDType::AVX2 MEGDNN_COMMA dt_qint32 \
MEGDNN_COMMA dt_qint8>) \
break; \
case param::ConvBias::NonlineMode::H_SWISH: \
DO_CONV_KERN_FUN(i, bias_mode, is_quantized, \
HSwishOp<SIMDType::AVX2 MEGDNN_COMMA dt_qint32 \
MEGDNN_COMMA dt_qint8>) \
break; \
default: \
megdnn_assert(0); \
break; \
}
#define GET_BIAS_MODE_PARAM(i, is_quantized) \
switch (kern_param.bias_mode) { \
case BiasMode::NO_BIAS: \
GET_OP_PARAM(i, BiasMode::NO_BIAS, is_quantized) \
break; \
case BiasMode::BROADCAST_CHANNEL_BIAS: \
GET_OP_PARAM(i, BiasMode::BROADCAST_CHANNEL_BIAS, is_quantized) \
break; \
default: \
megdnn_assert(0); \
break; \
}
#define GET_QUANTIZED(i) \
switch (kern_param.dst_type.enumv()) { \
case DTypeEnum::QuantizedS8: \
GET_BIAS_MODE_PARAM(i, true) \
break; \
case DTypeEnum::QuantizedS32: \
GET_BIAS_MODE_PARAM(i, false) \
break; \
case DTypeEnum::Int32: \
GET_BIAS_MODE_PARAM(i, false) \
break; \
default: \
megdnn_assert(0); \
break; \
}
#define DISPATCH_CONV_KERN() \
switch (kern_param.filter_meta.spatial[0]) { \
case 2: \
GET_QUANTIZED(2) \
break; \
case 3: \
GET_QUANTIZED(3) \
break; \
case 5: \
GET_QUANTIZED(5) \
break; \
case 7: \
GET_QUANTIZED(7) \
break; \
default: \
megdnn_assert(0); \
break; \
}
    DISPATCH_CONV_KERN();

    auto exec_one_group = [bundle, do_conv_fun](const NCBKernParam& kern_param,
                                                const NCBKernIndex& ncb_index) {
        copy_padding_kern(bundle, kern_param, ncb_index);
        do_conv_fun(bundle, kern_param, ncb_index);
    };
    ncb_kerns.push_back({exec_one_group, {group, n, 1_z}});

    return ncb_kerns;
}

}  // namespace avx2_chanwise_stride1
}  // namespace x86
}  // namespace megdnn

// vim: syntax=cpp.doxygen
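A quick sanity check of the rounding rule in get_rectified_size (standalone C++, not part of this diff): (OW + 15) & ~15 rounds the output width up to a multiple of 16, the tile width the AVX2 int8 kernels produce per step, and IW2 = SW * OW2 + FW - SW is then the input width required to produce OW2 outputs:

    static_assert(((25 + 15) & ~15) == 32, "OW = 25 pads up to 32");
    static_assert(((16 + 15) & ~15) == 16, "multiples of 16 are unchanged");
    // With SW = 1, FW = 3, OW2 = 32: IW2 = 1 * 32 + 3 - 1 = 34.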
dnn/src/x86/conv_bias/int8/avx2_chanwise_stride1.h
new file mode 100644
/**
 * \file src/x86/conv_bias/int8/avx2_chanwise_stride1.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#pragma once
#include "src/x86/conv_bias/opr_impl.h"
namespace megdnn {
namespace x86 {
namespace avx2_chanwise_stride1 {

using NCBKern = fallback::ConvBiasImpl::NCBKern;
using NCBKernSizeParam = fallback::ConvBiasImpl::NCBKernSizeParam;
using NCBKernParam = fallback::ConvBiasImpl::NCBKernParam;
using NCBKernIndex = fallback::ConvBiasImpl::NCBKernIndex;

using conv_fun = std::function<void(WorkspaceBundle bundle,
                                    const NCBKernParam& kern_param,
                                    const NCBKernIndex& ncb_index)>;

bool need_dst_copy(const NCBKernSizeParam& param);

bool need_src_copy(const NCBKernSizeParam& param);

void get_rectified_size(const NCBKernSizeParam& param, size_t& IH2,
                        size_t& IW2, size_t& OH2, size_t& OW2);

SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param,
                                WorkspaceBundle bundle);

}  // namespace avx2_chanwise_stride1
}  // namespace x86
}  // namespace megdnn

// vim: syntax=cpp.doxygen
dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.cpp
@@ -10,7 +10,6 @@
*/
#include "src/x86/conv_bias/int8/avx2_direct_conv_stride1.h"
#include "src/common/unroll_macro.h"
#include "src/x86/conv_bias/int8/common_helper.h"
#include "src/x86/conv_bias/postprocess_helper.h"
dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.cpp
@@ -10,7 +10,6 @@
*/
#include "src/x86/conv_bias/int8/avx2_direct_conv_stride2.h"
#include "src/common/unroll_macro.h"
#include "src/x86/conv_bias/int8/common_helper.h"
#include "src/x86/conv_bias/postprocess_helper.h"
dnn/src/x86/conv_bias/int8/common_helper.h
@@ -11,6 +11,7 @@
#pragma once
#include <immintrin.h>
#include "src/common/unroll_macro.h"
#include "megdnn/arch.h"
#ifdef WIN32CMAKE
#include <smmintrin.h>
dnn/src/x86/conv_bias/opr_impl.cpp
@@ -65,6 +65,10 @@ void* ConvBiasImpl::AlgoAVX2DirectConvStride2::type() const {
    return x86_algo_type;
}

void* ConvBiasImpl::AlgoChanWiseAvx2Stride1Qint8::type() const {
    return x86_algo_type;
}

class ConvBiasImpl::AlgoPack : NonCopyableObj {
    AlgoDirect stride1_direct_large_group{true};
    AlgoDirect stride1_direct_small_group{false};
...
@@ -72,6 +76,7 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
    AlgoDirectStride2 stride2_direct_small_group{false};
    AlgoDirectAvx2Stride1Int8 avx2_stride1_direct_int8;
    AlgoAVX2DirectConvStride2 avx2_stride2_direct;
    AlgoChanWiseAvx2Stride1Qint8 avx2_stride1_chanwise_qint8;
    AlgoMatrixMul matmul;
#if defined(MEGDNN_X86_WITH_MKL_DNN)
    AlgoMkldnnMatmulQint8 mkldnn_matmul_qint8;
...
@@ -94,6 +99,7 @@ public:
        all_algos.emplace_back(&stride2_direct_small_group);
        all_algos.emplace_back(&avx2_stride1_direct_int8);
        all_algos.emplace_back(&avx2_stride2_direct);
        all_algos.emplace_back(&avx2_stride1_chanwise_qint8);
        all_algos.emplace_back(&matmul);
        static CpuOprDelegationStorage<> storage;
...
dnn/src/x86/conv_bias/opr_impl.h
@@ -31,6 +31,7 @@ public:
    class AlgoMatrixMul;
    class AlgoDirectAvx2Stride1Int8;
    class AlgoAVX2DirectConvStride2;
    class AlgoChanWiseAvx2Stride1Qint8;
#if defined(MEGDNN_X86_WITH_MKL_DNN)
    class AlgoMkldnnConv;
    class AlgoMkldnnQint8;
...
dnn/src/x86/elemwise_helper/kimpl/typecvt.h
@@ -257,6 +257,32 @@ struct TypeCvtOp<SIMDType::SSE4_2, dt_qint32, dt_qint8>
    }
};

template <>
struct TypeCvtOp<SIMDType::AVX2, dt_qint32, dt_qint8>
        : UnaryOpBase<SIMDType::AVX2, dt_qint32, dt_qint8> {
    using UnaryOpBase::UnaryOpBase;
    constexpr static size_t SIMD_WIDTH = 8;

    MEGDNN_ATTRIBUTE_TARGET("avx2")
    void operator()(const __m256ix2& vsrc, dt_qint8* dst) const {
        _mm_store_si128((__m128i*)(dst), (operator()(vsrc)));
    }

    MEGDNN_ATTRIBUTE_TARGET("avx2")
    __m128i operator()(const __m256ix2& vsrc) const {
        auto cvtps_src0 = _mm256_cvtepi32_ps(vsrc.val[0]);
        auto cvtps_src1 = _mm256_cvtepi32_ps(vsrc.val[1]);
        auto vitem0 = _mm256_mul_ps(cvtps_src0, _mm256_set1_ps(this->scale));
        auto vitem1 = _mm256_mul_ps(cvtps_src1, _mm256_set1_ps(this->scale));
        return QConverter::convert<__m128i, __m256x2>({{vitem0, vitem1}});
    }

    void operator()(src_ctype src, dst_ctype* dst) {
        *reinterpret_cast<int8_t*>(dst) = saturate<int8_t, float>(
                std::round(src.as_int32() * scale), -128, 127);
    }
};

template <>
struct TypeCvtOp<SIMDType::SSE4_2, dt_float32, dt_qint8>
        : UnaryOpBase<SIMDType::SSE4_2, dt_float32, dt_qint8> {
...
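The scalar operator at the end of the AVX2 struct pins down the semantics the vector path must reproduce: scale the int32 value, round, then saturate into [-128, 127]. A hypothetical example (scale value chosen purely for illustration):

    // With scale = 0.5f:
    //   src =  100 -> round( 50.0) =  50
    //   src =  300 -> round(150.0) = 150 -> saturates to  127
    //   src = -300 -> round(-150.0)      -> saturates to -128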
dnn/test/x86/conv_bias.cpp
@@ -40,6 +40,165 @@ TEST_F(X86, CONV_BIAS_FORWARD) {
                .execs({arg.src, arg.filter, arg.bias, {}, {}});
    }
}

TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_INT8x8x32) {
    using namespace conv_bias;
    std::vector<TestArg> args;

    auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
                   NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        param.sparse = param::ConvBias::Sparse::GROUP;

        //! no bias
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{ic, 1, 1, kernel, kernel},
                          TensorShape{});
        //! bias channel
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{ic, 1, 1, kernel, kernel},
                          TensorShape{1, ic, 1, 1});
    };

    for (size_t kernel : {2, 3, 5, 7})
        for (size_t pad : {0, 1})
            for (size_t ic : {1, 5, 17, 20})
                for (size_t h : {7, 16, 38, 40})
                    for (size_t w : {16, 25, 40, 55})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY})
                            run(ic, w, h, kernel, pad, nonline_mode);

    Checker<ConvBias> checker(handle());
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::Int8())
            .set_dtype(1, dtype::Int8())
            .set_dtype(2, dtype::Int32())
            .set_dtype(4, dtype::Int32())
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng)
            .set_epsilon(1e-3);
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
    for (auto&& arg : args) {
        checker.set_param(arg.param).exec(
                {arg.src, arg.filter, arg.bias, {}, {}});
    }
}

TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS32) {
    using namespace conv_bias;
    std::vector<TestArg> args;

    auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
                   NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        param.sparse = param::ConvBias::Sparse::GROUP;

        //! no bias
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{ic, 1, 1, kernel, kernel},
                          TensorShape{});
        //! bias channel
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{ic, 1, 1, kernel, kernel},
                          TensorShape{1, ic, 1, 1});
    };

    for (size_t kernel : {2, 3, 5, 7})
        for (size_t pad : {0, 1})
            for (size_t ic : {1, 3, 5, 7, 17})
                for (size_t h : {10, 17, 25, 30})
                    for (size_t w : {19, 28, 58, 168})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY})
                            run(ic, w, h, kernel, pad, nonline_mode);

    Checker<ConvBias> checker(handle());
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, {})
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng)
            .set_epsilon(1e-3);
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
    for (auto&& arg : args) {
        checker.set_param(arg.param).exec(
                {arg.src, arg.filter, arg.bias, {}, {}});
    }
}

TEST_F(X86_MULTI_THREADS, AVX2_CHANWISE_DIRECT_STRIDE1_QuantizedS8x8x8) {
    using namespace conv_bias;
    std::vector<TestArg> args;

    auto run = [&](size_t ic, size_t w, size_t h, size_t kernel, size_t p,
                   NonlineMode nonline_mode) {
        if (w + 2 * p < kernel || h + 2 * p < kernel)
            return;
        param::ConvBias param;
        param.stride_h = 1;
        param.stride_w = 1;
        param.pad_h = p;
        param.pad_w = p;
        param.nonlineMode = nonline_mode;
        param.sparse = param::ConvBias::Sparse::GROUP;

        //! no bias
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{ic, 1, 1, kernel, kernel},
                          TensorShape{});
        //! bias channel
        args.emplace_back(param, TensorShape{2, ic, h, w},
                          TensorShape{ic, 1, 1, kernel, kernel},
                          TensorShape{1, ic, 1, 1});
    };

    for (size_t kernel : {2, 3, 5, 7})
        for (size_t pad : {0, 1})
            for (size_t ic : {1, 3, 5, 7, 17})
                for (size_t h : {10, 15, 17, 30})
                    for (size_t w : {19, 28, 58, 168})
                        for (NonlineMode nonline_mode :
                             {NonlineMode::IDENTITY, NonlineMode::H_SWISH,
                              NonlineMode::RELU})
                            run(ic, w, h, kernel, pad, nonline_mode);

    Checker<ConvBias> checker(handle());
    UniformIntRNG rng{-50, 50};
    checker.set_dtype(0, dtype::QuantizedS8(2.5f))
            .set_dtype(1, dtype::QuantizedS8(2.5f))
            .set_dtype(2, dtype::QuantizedS32(6.25f))
            .set_dtype(4, dtype::QuantizedS8(60.25f))
            .set_rng(0, &rng)
            .set_rng(1, &rng)
            .set_rng(2, &rng)
            .set_epsilon(1e-3);
    checker.set_before_exec_callback(
            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
                    "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1"));
    for (auto&& arg : args) {
        checker.set_param(arg.param).exec(
                {arg.src, arg.filter, arg.bias, {}, {}});
    }
}

TEST_F(X86_MULTI_THREADS, AVX2_CONV_BIAS_DIRECT_STRIDE1_INT8x8x32) {
    using namespace conv_bias;
    std::vector<TestArg> args;
...
@@ -1556,6 +1715,67 @@ void benchmark_impl_comp(const param::ConvBias param,
}
}  // namespace
TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_CHANWISE_AVX2_INT8) {
    constexpr size_t RUNS = 50;

    param::ConvBias param;
    param.stride_h = 1;
    param.stride_w = 1;
    param.sparse = param::ConvBias::Sparse::GROUP;

    std::vector<DType> data_type = {dtype::Int8(), dtype::Int8(),
                                    dtype::Int32(), dtype::Int32()};
    std::vector<std::pair<SmallVector<TensorShape>, float>>
            shapes_and_computation;
    auto bench_case = [&](size_t N, size_t IC, size_t H, size_t W, size_t FS) {
        param.pad_h = FS / 2;
        param.pad_w = FS / 2;
        SmallVector<TensorShape> shapes{
                {N, IC, H, W}, {IC, 1, 1, FS, FS}, {}, {}, {}};
        TensorShape dst{N, IC, (H + 2 * param.pad_h - FS) + 1,
                        (W + 2 * param.pad_w - FS) + 1};
        float computations = (FS * FS * dst.total_nr_elems() * 2) * 1e-6;
        shapes_and_computation.push_back(std::make_pair(shapes, computations));
    };

    bench_case(1, 32, 112, 112, 7);
    bench_case(1, 144, 56, 56, 7);
    bench_case(1, 192, 28, 28, 7);
    bench_case(1, 384, 28, 28, 7);
    bench_case(1, 576, 14, 14, 7);
    bench_case(1, 960, 7, 7, 7);

    bench_case(1, 32, 112, 112, 5);
    bench_case(1, 144, 56, 56, 5);
    bench_case(1, 192, 28, 28, 5);
    bench_case(1, 384, 28, 28, 5);
    bench_case(1, 576, 14, 14, 5);
    bench_case(1, 960, 7, 7, 5);

    bench_case(1, 32, 112, 112, 3);
    bench_case(1, 144, 56, 56, 3);
    bench_case(1, 192, 28, 28, 3);
    bench_case(1, 384, 28, 28, 3);
    bench_case(1, 576, 14, 14, 3);
    bench_case(1, 960, 7, 7, 3);

    bench_case(1, 32, 112, 112, 2);
    bench_case(1, 144, 56, 56, 2);
    bench_case(1, 192, 28, 28, 2);
    bench_case(1, 384, 28, 28, 2);
    bench_case(1, 576, 14, 14, 2);
    bench_case(1, 960, 7, 7, 2);

    std::string algo_name = "X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1";
    printf("Benchmark X86_CONV_BIAS_CHANWISE_AVX2_INT8_STRIDE1\n");
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
                   {4, {4, 5, 6, 7}}, {1, {4}}, data_type);
    benchmark_impl(param, shapes_and_computation, algo_name, RUNS,
                   {2, {4, 5}}, {1, {4}}, data_type);
    shapes_and_computation.clear();
}

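The computations figure counts one multiply and one add per filter tap per output element, scaled to MFLOPs. Working through the first case above, bench_case(1, 32, 112, 112, 7):

    // pad = 7 / 2 = 3, so dst = {1, 32, (112 + 6 - 7) + 1, (112 + 6 - 7) + 1}
    //                         = {1, 32, 112, 112};
    // computations = 7 * 7 * (1 * 32 * 112 * 112) * 2 * 1e-6 ≈ 39.3 MFLOPs.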
TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_DIRECT_AVX2_INT8) {
    constexpr size_t RUNS = 50;

    param::ConvBias param;
...