Commit b0ba6d32
Authored Oct 18, 2021 by liuke; committed by Megvii Engine Team, Oct 18, 2021.

Merge pull request #207 from togetherwhenyouwant:feat-x86-matmul-6x16x2

GitOrigin-RevId: 148ae44ba01e821111701607f65c591afeb2cefd
Parents: a4ac5e7e, d2184af3
Showing 13 changed files with 1,599 additions and 6 deletions (+1599, -6).
CMakeLists.txt  (+1, -0)
dnn/src/fallback/conv_bias/im2col/algos.cpp  (+6, -6)
dnn/src/fallback/matrix_mul/opr_impl.h  (+1, -0)
dnn/src/x86/avx_helper.h  (+9, -0)
dnn/src/x86/matrix_mul/algos.cpp  (+66, -0)
dnn/src/x86/matrix_mul/algos.h  (+11, -0)
dnn/src/x86/matrix_mul/f32/strategy.h  (+3, -0)
dnn/src/x86/matrix_mul/f32/strategy_6x16.cpp  (+1278, -0)
dnn/src/x86/matrix_mul/opr_impl.cpp  (+2, -0)
dnn/src/x86/matrix_mul/opr_impl.h  (+1, -0)
dnn/test/x86/accuracy_shake.cpp  (+9, -0)
dnn/test/x86/conv_bias.cpp  (+198, -0)
dnn/test/x86/matrix_mul.cpp  (+14, -0)
CMakeLists.txt

@@ -10,6 +10,7 @@ project(MegEngine LANGUAGES C CXX VERSION ${MGB_VER_STRING})
 set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
 set(CMAKE_POLICY_DEFAULT_CMP0048 NEW)
dnn/src/fallback/conv_bias/im2col/algos.cpp

@@ -8,11 +8,11 @@
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  */
+#include "src/fallback/conv_bias/im2col/algos.h"
 #include "megdnn/opr_param_defs.h"
 #include "src/common/opr_delegate.h"
 #include "src/fallback/conv_bias/common.h"
-#include "src/fallback/conv_bias/im2col/algos.h"
 #include "src/fallback/conv_bias/im2col/factory.h"
 #include "src/fallback/conv_bias/im2col/im2col_kerns.h"
 #include "src/fallback/conv_bias/opr_impl.h"

@@ -68,16 +68,16 @@ static void choice_ohw_oc_block(
         fallback::MatrixMulImpl::AlgoBase::PackMode pack_mode) {
     //! calculate m_oc_tile_size in choice_ohw_oc_block() function,
     //! when ohw_tile_size < this value ohw_tile_size = ohw
-    static constexpr size_t DEFAULT_OHW_MIN_TILE_SIZE = 32;
+    size_t DEFAULT_OHW_MIN_TILE_SIZE = round_up(static_cast<size_t>(32), block_n);
     //! when nr_threads > 1 and round(ohw,nr_threads)>nr_threads,
     //! oc_tile_size = DEFAULT_OC_TILE_SIZE
-    static constexpr size_t DEFAULT_OC_TILE_SIZE = 512;
+    size_t DEFAULT_OC_TILE_SIZE = round_up(static_cast<size_t>(512), block_m);
     //! when oc_tile_size > this value m_oc_tile_size =
     //! DEFAULT_OC_MAX_TILE_SIZE
-    static constexpr size_t DEFAULT_OC_MAX_TILE_SIZE = 1024;
+    size_t DEFAULT_OC_MAX_TILE_SIZE = round_up(static_cast<size_t>(1024), block_m);
     //! when oc_tile_size < this value oc_tile_size =
     //! DEFAULT_OC_MIN_TILE_SIZE the purpose is aligning the calculation
-    static constexpr size_t DEFAULT_OC_MIN_TILE_SIZE = 128;
+    size_t DEFAULT_OC_MIN_TILE_SIZE = round_up(static_cast<size_t>(128), block_m);
     size_t nr_threads = param.nr_threads;
     size_t OC = param.filter_meta.ocpg;
     size_t ohw = param.osz[0] * param.osz[1];
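The motivation for the change above: the old tile sizes were compile-time constants, but the new kernel's block sizes (block_m = 6, block_n = 16) do not divide 512, 1024, or 128 evenly, so the defaults are now rounded up to block multiples at runtime. A minimal sketch of the usual round_up idiom and the values it produces here, assuming the standard ceiling-division definition (MegDNN's own helper is declared elsewhere in the tree):

#include <cstddef>
#include <cstdio>

// Round x up to the next multiple of align (align > 0).
static size_t round_up(size_t x, size_t align) {
    return (x + align - 1) / align * align;
}

int main() {
    // With the 6x16 kernel, block_m = 6 and block_n = 16:
    printf("%zu\n", round_up(32, 16));   // 32   (already a multiple of block_n)
    printf("%zu\n", round_up(512, 6));   // 516  (OC tile aligned to block_m)
    printf("%zu\n", round_up(1024, 6));  // 1026
    printf("%zu\n", round_up(128, 6));   // 132
    return 0;
}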
dnn/src/fallback/matrix_mul/opr_impl.h

@@ -123,6 +123,7 @@ public:
         X86_INT8X8X16_SSE,
         X86_INT8X8X32_SSE_4X8X2,
         X86_F32_MK8_8X8,
+        X86_F32_6x16,
         X86_INT8X8X32_VNNI,
         X86_INT8X8X32_MKLDNN,
 #elif MEGDNN_AARCH64 || MEGDNN_ARMV7
dnn/src/x86/avx_helper.h

@@ -33,6 +33,15 @@ static inline __m256 _mm256_loadu2_m128_emulate(
             _mm256_castps128_ps256(_mm_loadu_ps(loaddr)), _mm_loadu_ps(hiaddr), 1);
 }

+MEGDNN_ATTRIBUTE_TARGET("avx")
+static inline void _mm256_storeu2_m128_emulate(
+        float* hiaddr, float* loaddr, __m256 reg) {
+    auto xmm0 = _mm256_extractf128_ps(reg, 0);
+    auto xmm1 = _mm256_extractf128_ps(reg, 1);
+    _mm_storeu_ps(loaddr, xmm0);
+    _mm_storeu_ps(hiaddr, xmm1);
+}
+
 template <typename ctype, size_t len>
 struct Vector;
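The new _mm256_storeu2_m128_emulate mirrors the load helper above it: it splits a 256-bit register into its two 128-bit halves and stores them to two independent addresses, presumably what the 16-wide kernel needs when its output halves are not contiguous. A standalone round-trip sketch, assuming a plain GCC/Clang target attribute in place of MEGDNN_ATTRIBUTE_TARGET:

#include <immintrin.h>
#include <cstdio>

__attribute__((target("avx")))
static void storeu2_demo() {
    float src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    float lo[4], hi[4];
    __m256 reg = _mm256_loadu_ps(src);                 // 8 packed floats
    _mm_storeu_ps(lo, _mm256_extractf128_ps(reg, 0));  // lanes 0..3
    _mm_storeu_ps(hi, _mm256_extractf128_ps(reg, 1));  // lanes 4..7
    printf("lo[0]=%g hi[0]=%g\n", lo[0], hi[0]);       // lo[0]=0 hi[0]=4
}

int main() {
    storeu2_demo();
    return 0;
}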
dnn/src/x86/matrix_mul/algos.cpp

@@ -309,6 +309,33 @@ void gemm_s8s8s32_sse_4x8x2(const MatrixMulImpl::KernParam& kern_param) {
     MIDOUT_END();
 }

+void gemm_f32_avx2_6x16(const MatrixMulImpl::KernParam& kern_param) {
+    MEGDNN_MARK_USED_VAR(kern_param);
+    MIDOUT_BEGIN(megdnn_x86_matmul_kern_avx2_6x16x2, midout_iv(0)) {
+        constexpr int cacheline = 64;
+        const size_t m = kern_param.M;
+        const size_t n = kern_param.N;
+        const size_t k = kern_param.K;
+        const bool trans_a = kern_param.trA;
+        const bool trans_b = kern_param.trB;
+        const size_t lda = kern_param.LDA;
+        const size_t ldb = kern_param.LDB;
+        const size_t ldc = kern_param.LDC;
+        auto a_type = kern_param.A_type;
+        auto b_type = kern_param.B_type;
+        auto c_type = kern_param.C_type;
+        const auto a_ptr = kern_param.A<float>();
+        const auto b_ptr = kern_param.B<float>();
+        auto c_ptr = kern_param.C<float>();
+        x86::matmul::sgemm_pack_6x16_avx2 strategy(m, n, k, a_type, b_type, c_type);
+        megdnn::matmul::GemmInterleaved<x86::matmul::sgemm_pack_6x16_avx2>(
+                m, n, k, trans_a, trans_b, strategy, cacheline)
+                .execute(a_ptr, lda, b_ptr, ldb, c_ptr, ldc, kern_param.workspace_ptr);
+    }
+    MIDOUT_END();
+}
+
 }  // namespace

 /*************************AlgoInt8x8x16AVX2********************/
@@ -625,4 +652,43 @@ size_t MatrixMulImpl::AlgoF32MK8_8x8::get_workspace(
     MIDOUT_END();
 }

+/*************************AlgoFloatAVX2M6N16********************/
+MatrixMulImpl::kern_t MatrixMulImpl::AlgoFloatAVX2M6N16::get_kern(
+        const KernSizeParam&) const {
+    return gemm_f32_avx2_6x16;
+}
+
+bool MatrixMulImpl::AlgoFloatAVX2M6N16::usable(
+        const KernSizeParam& kern_size_param) const {
+    bool is_param_ok =
+            kern_size_param.A_type.enumv() == kern_size_param.B_type.enumv() &&
+            ((kern_size_param.A_type.enumv() == DTypeEnum::Float32 &&
+              kern_size_param.C_type.enumv() == DTypeEnum::Float32)) &&
+            kern_size_param.compute_mode == Param::ComputeMode::DEFAULT &&
+            kern_size_param.format == Param::Format::DEFAULT &&
+            is_supported(SIMDType::AVX2);
+    return is_param_ok;
+}
+
+size_t MatrixMulImpl::AlgoFloatAVX2M6N16::get_workspace(
+        const KernSizeParam& kern_param) const {
+    constexpr int cacheline = 64;
+    const size_t m = kern_param.M;
+    const size_t n = kern_param.N;
+    const size_t k = kern_param.K;
+    const bool trans_a = kern_param.trA;
+    const bool trans_b = kern_param.trB;
+    auto a_type = kern_param.A_type;
+    auto b_type = kern_param.B_type;
+    auto c_type = kern_param.C_type;
+    x86::matmul::sgemm_pack_6x16_avx2 strategy(m, n, k, a_type, b_type, c_type);
+    return megdnn::matmul::GemmInterleaved<x86::matmul::sgemm_pack_6x16_avx2>(
+                   m, n, k, trans_a, trans_b, strategy, cacheline)
+            .get_workspace_size();
+}
+
+MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL(
+        AlgoFloatAVX2M6N16, megdnn_x86_matmul_kern, "AlgoFloatAVX2M6N16"_hash,
+        x86::matmul::sgemm_pack_6x16_avx2, float, float, float,
+        AlgoDataType::FLOAT32, DEFAULT);
+
 // vim: syntax=cpp.doxygen
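For orientation, GemmInterleaved<Strategy> is MegDNN's generic pack-and-compute driver: it tiles the output into panels of the strategy's block shape (6 rows by 16 columns here), packs the corresponding slices of A and B into the workspace, and runs the strategy's micro-kernel tile by tile; get_workspace above asks the same object how much packing space that requires. A scalar sketch of the 6x16 output blocking, assuming row-major untransposed inputs and skipping the packing step (the real AVX2 kernel lives in strategy_6x16.cpp):

#include <algorithm>
#include <cstddef>

// Each outer iteration computes one MR x NR tile of C = A * B.
// In the AVX2 kernel the 6x16 accumulator lives in 12 ymm registers.
constexpr size_t MR = 6, NR = 16;

void gemm_6x16_blocked(
        size_t M, size_t N, size_t K, const float* A, size_t lda, const float* B,
        size_t ldb, float* C, size_t ldc) {
    for (size_t i = 0; i < M; i += MR) {
        size_t mr = std::min(MR, M - i);
        for (size_t j = 0; j < N; j += NR) {
            size_t nr = std::min(NR, N - j);
            float acc[MR][NR] = {};  // one output tile, accumulated over K
            for (size_t k = 0; k < K; ++k)
                for (size_t ii = 0; ii < mr; ++ii)
                    for (size_t jj = 0; jj < nr; ++jj)
                        acc[ii][jj] += A[(i + ii) * lda + k] * B[k * ldb + j + jj];
            for (size_t ii = 0; ii < mr; ++ii)
                for (size_t jj = 0; jj < nr; ++jj)
                    C[(i + ii) * ldc + j + jj] = acc[ii][jj];
        }
    }
}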
dnn/src/x86/matrix_mul/algos.h

@@ -134,6 +134,17 @@ public:
     MEGDNN_DECL_ALGO_TYPE(X86_F32_MK8_8X8)
 };

+class MatrixMulImpl::AlgoFloatAVX2M6N16 : public AlgoBase {
+public:
+    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
+    const char* name() const override { return "X86_F32_6x16"; }
+    bool usable(const KernSizeParam&) const override;
+    size_t get_workspace(const KernSizeParam&) const override;
+    kern_t get_kern(const KernSizeParam&) const override;
+    MEGDNN_REG_GEMM_FUNC_FOR_IM2COL();
+    MEGDNN_DECL_ALGO_TYPE(X86_F32_6x16)
+};
+
 #if MEGDNN_X86_WITH_VNNI
 class MatrixMulImpl::AlgoInt8x8x32Vnni : public AlgoBase {
 public:
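Note that the usable() implementation shown earlier gates this algorithm at runtime through is_supported(SIMDType::AVX2), so a generic x86 build only dispatches to the 6x16 kernel on CPUs that actually have AVX2. A minimal sketch of that kind of runtime check using the GCC/Clang builtin (is_supported is MegDNN's own helper; this builtin is just the illustrative stand-in):

#include <cstdio>

int main() {
    // GCC/Clang builtin backed by CPUID; MegDNN's is_supported(SIMDType::AVX2)
    // plays the same role inside usable().
    if (__builtin_cpu_supports("avx2")) {
        printf("AVX2 available: X86_F32_6x16 is usable\n");
    } else {
        printf("no AVX2: fall back to SSE/scalar algorithms\n");
    }
    return 0;
}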
dnn/src/x86/matrix_mul/f32/strategy.h

@@ -19,6 +19,9 @@ namespace matmul {
 MEGDNN_REG_GEMM_STRATEGY_NOPACK(
         float, float, float, 8, 8, 8, false, true, sgemm_nopack_8x8_avx2);

+MEGDNN_REG_GEMM_STRATEGY_WITH_PACK_A_TYPE(
+        float, float, float, float, 6, 16, 1, false, false, sgemm_pack_6x16_avx2);
+
 }  // namespace matmul
 }  // namespace x86
 }  // namespace megdnn
\ No newline at end of file
dnn/src/x86/matrix_mul/f32/strategy_6x16.cpp
new file (mode 100644)
This diff is collapsed (1,278 added lines).
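The collapsed file carries the kernel itself. Purely as a hedged illustration of the technique, and not the actual file contents: the core step of a 6x16 AVX2/FMA micro-kernel typically broadcasts one element of A per output row and accumulates it against two 8-float vectors of B, so the full 6x16 accumulator occupies twelve of the sixteen ymm registers:

#include <immintrin.h>

// Hypothetical inner step for one k: acc[i] += a[i] * b[0..15].
// Requires FMA (present on all AVX2-generation x86 CPUs in practice).
__attribute__((target("avx2,fma")))
void kern_6x16_step(
        const float* a /* 6 packed A values */, const float* b /* 16 B values */,
        __m256 acc[6][2]) {
    __m256 b0 = _mm256_loadu_ps(b);      // B columns 0..7
    __m256 b1 = _mm256_loadu_ps(b + 8);  // B columns 8..15
    for (int i = 0; i < 6; ++i) {
        __m256 ai = _mm256_broadcast_ss(a + i);          // splat one A value
        acc[i][0] = _mm256_fmadd_ps(ai, b0, acc[i][0]);  // fused multiply-add
        acc[i][1] = _mm256_fmadd_ps(ai, b1, acc[i][1]);
    }
}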
dnn/src/x86/matrix_mul/opr_impl.cpp

@@ -34,6 +34,7 @@ class MatrixMulImpl::AlgoPack : NonCopyableObj {
     AlgoInt8x8x16AVX2 algoint8x8x16avx2_m4n16k2;
     AlgoInt8x8x16SSE algoint8x8x16sse_m4n8k2;
     AlgoF32MK8_8x8 algof32mk8_8x8;
+    AlgoFloatAVX2M6N16 algof32_6x16;
     SmallVector<fallback::MatrixMulImpl::AlgoBase*> m_all_algos;
     fallback::MatrixMulImpl::AlgoBase::Mapper m_all_algos_map;

@@ -51,6 +52,7 @@ public:
         m_all_algos.emplace_back(&algoint8x8x32sse_m4n8k2);
         m_all_algos.emplace_back(&algoint8x8x16sse_m4n8k2);
         m_all_algos.emplace_back(&algof32mk8_8x8);
+        m_all_algos.emplace_back(&algof32_6x16);
 #if MEGDNN_X86_WITH_MKL_DNN
         m_all_algos.emplace_back(&algoint8x8x32mkldnn);
 #endif
dnn/src/x86/matrix_mul/opr_impl.h

@@ -67,6 +67,7 @@ private:
     class AlgoInt8x8x16SSE;
     class AlgoPack;
     class AlgoF32MK8_8x8;
+    class AlgoFloatAVX2M6N16;

 public:
     static const AlgoPack& algo_pack();
dnn/test/x86/accuracy_shake.cpp

@@ -84,6 +84,15 @@ TEST_F(X86, SHAKE_MATRIX_MUL_FORWARD) {
             .exec({{20, 100}, {100, 60}, {}});
 }

+TEST_F(X86, SHAKE_MATRIX_MUL_6x16_FORWARD) {
+    AccuracyShakeChecker<MatrixMul> checker(handle());
+    checker.set_before_exec_callback(AlgoGenerator<MatrixMul>("X86_F32_6x16"));
+    checker.set_dtype(0, dtype::Float32())
+            .set_dtype(1, dtype::Float32())
+            .set_dtype(2, dtype::Float32())
+            .exec({{20, 100}, {100, 60}, {}});
+}
+
 }  // namespace test
 }  // namespace megdnn
dnn/test/x86/conv_bias.cpp

@@ -1150,6 +1150,56 @@ TEST_F(X86, CONV_BIAS_IM2COLMATMUL_FP32_NOPACK_PREPROCESS) {
 #endif

+TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_6x16) {
+    using namespace conv_bias;
+    std::vector<TestArg> args;
+
+    auto run = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p,
+                   NonlineMode nonline_mode) {
+        if (w + 2 * p < kernel || h + 2 * p < kernel)
+            return;
+        param::ConvBias param;
+        param.stride_h = 1;
+        param.stride_w = 1;
+        param.pad_h = p;
+        param.pad_w = p;
+        param.nonlineMode = nonline_mode;
+        //! no bias
+        args.emplace_back(
+                param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
+                TensorShape{});
+        args.emplace_back(
+                param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
+                TensorShape{1, oc, 1, 1});
+        args.emplace_back(
+                param, TensorShape{1, ic, h, w}, TensorShape{oc, ic, kernel, kernel},
+                TensorShape{1, oc, (h + 2 * p - kernel) / param.stride_h + 1,
+                            (w + 2 * p - kernel) / param.stride_w + 1});
+    };
+
+    for (size_t kernel : {2, 3, 4, 5, 6, 7})
+        for (size_t ic : {1, 4, 8, 16})
+            for (size_t oc : {1, 4, 8, 16, 300})
+                for (size_t p : {0, 2})
+                    for (size_t size : {8, 24})
+                        for (NonlineMode nonline_mode :
+                             {NonlineMode::IDENTITY, NonlineMode::RELU}) {
+                            run(oc, ic, size, size, kernel, p, nonline_mode);
+                        }
+    run(2046, 8, 20, 20, 3, 1, NonlineMode::IDENTITY);
+
+    Checker<ConvBias> checker(handle());
+#define cb(algo_name)                                                                \
+    checker.set_before_exec_callback(                                                \
+            conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name));                    \
+    for (auto&& arg : args) {                                                        \
+        checker.set_param(arg.param).execs({arg.src, arg.filter, arg.bias, {}, {}}); \
+    }
+    cb("IM2COLMATMUL:X86_F32_6x16:192");
+}
+
 #if MEGDNN_X86_WITH_MKL && SUPPORT_MKL_PACKED_GEMM
 TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) {
     using namespace conv_bias;
@@ -1435,6 +1485,12 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_INT8X8X32_PREPROCESS) {
 #endif

+TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_6x16) {
+    using namespace conv_bias;
+    std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(false, false);
+    check_conv_bias(args, handle(), "CONV1x1:X86_F32_6x16:48");
+}
+
 TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) {
     using namespace conv_bias;
     std::vector<TestArg> args;
@@ -2651,6 +2707,148 @@ TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32_single_thread)
     shapes_and_computation.clear();
 }

+TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32_6x16) {
+    constexpr size_t RUNS = 50;
+
+    param::ConvBias param;
+    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
+    param.pad_h = 1;
+    param.pad_w = 1;
+    param.stride_h = 1;
+    param.stride_w = 1;
+    std::vector<DType> data_type = {
+            dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
+
+    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
+    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
+                          size_t FS, size_t group) {
+        SmallVector<TensorShape> shapes{
+                {N, IC, H, W},
+                {OC / group, IC / group, FS, FS},
+                {1, OC, 1, 1},
+                {},
+                {N, OC, H, W}};
+        TensorShape dst{N, OC, H, W};
+        float computations =
+                ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
+                 dst.total_nr_elems()) *
+                1e-6;
+        shapes_and_computation.push_back(std::make_pair(shapes, computations));
+    };
+
+    bench_case(1, 32, 32, 200, 200, 3, 1);
+    bench_case(1, 32, 32, 200, 200, 3, 1);
+    bench_case(1, 32, 32, 128, 128, 3, 1);
+    bench_case(1, 32, 32, 128, 128, 3, 1);
+    bench_case(1, 32, 32, 100, 100, 3, 1);
+    bench_case(1, 32, 32, 100, 100, 3, 1);
+    bench_case(1, 32, 32, 80, 80, 3, 1);
+    bench_case(1, 32, 32, 80, 80, 3, 1);
+    bench_case(1, 64, 32, 7, 7, 3, 1);
+    bench_case(1, 64, 64, 7, 7, 3, 1);
+    bench_case(1, 64, 128, 7, 7, 3, 1);
+    bench_case(1, 64, 256, 7, 7, 3, 1);
+    bench_case(1, 64, 512, 7, 7, 3, 1);
+    bench_case(1, 64, 1024, 7, 7, 3, 1);
+    bench_case(1, 64, 32, 14, 14, 3, 1);
+    bench_case(1, 64, 64, 14, 14, 3, 1);
+    bench_case(1, 64, 128, 14, 14, 3, 1);
+    bench_case(1, 64, 256, 14, 14, 3, 1);
+    bench_case(1, 64, 512, 14, 14, 3, 1);
+    bench_case(1, 64, 1024, 14, 14, 3, 1);
+    bench_case(1, 128, 128, 14, 14, 3, 1);
+    bench_case(1, 128, 256, 14, 14, 3, 1);
+    bench_case(1, 512, 512, 14, 14, 3, 1);
+    bench_case(1, 256, 512, 14, 14, 3, 1);
+    bench_case(1, 512, 1024, 14, 14, 3, 1);
+    bench_case(1, 1024, 1024, 14, 14, 3, 1);
+
+    std::string algo_name = "IM2COLMATMUL:X86_F32_6x16:192";
+    printf("Benchmark IM2COLMATMUL:X86_F32_6x16 algo\n");
+    benchmark_impl(
+            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}},
+            {1, {4}}, data_type);
+    benchmark_impl(
+            param, shapes_and_computation, algo_name, RUNS, {4, {4, 5, 6, 7}},
+            {1, {7}}, data_type);
+    benchmark_impl(
+            param, shapes_and_computation, algo_name, RUNS, {2, {4, 5}}, {1, {4}},
+            data_type);
+    shapes_and_computation.clear();
+}
+
+TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_F32_6X16_single_thread) {
+    constexpr size_t RUNS = 50;
+
+    param::ConvBias param;
+    param.nonlineMode = param::ConvBias::NonlineMode::RELU;
+    param.pad_h = 1;
+    param.pad_w = 1;
+    param.stride_h = 1;
+    param.stride_w = 1;
+    std::vector<DType> data_type = {
+            dtype::Float32(), dtype::Float32(), dtype::Float32(), dtype::Float32()};
+
+    std::vector<std::pair<SmallVector<TensorShape>, float>> shapes_and_computation;
+    auto bench_case = [&](size_t N, size_t IC, size_t OC, size_t H, size_t W,
+                          size_t FS, size_t group) {
+        SmallVector<TensorShape> shapes{
+                {N, IC, H, W},
+                {OC / group, IC / group, FS, FS},
+                {1, OC, 1, 1},
+                {},
+                {N, OC, H, W}};
+        TensorShape dst{N, OC, H, W};
+        float computations =
+                ((IC / group) * FS * FS * dst.total_nr_elems() * 2 +
+                 dst.total_nr_elems()) *
+                1e-6;
+        shapes_and_computation.push_back(std::make_pair(shapes, computations));
+    };
+
+    bench_case(1, 32, 32, 200, 200, 3, 1);
+    bench_case(1, 32, 32, 200, 200, 3, 1);
+    bench_case(1, 32, 32, 128, 128, 3, 1);
+    bench_case(1, 32, 32, 128, 128, 3, 1);
+    bench_case(1, 32, 32, 100, 100, 3, 1);
+    bench_case(1, 32, 32, 100, 100, 3, 1);
+    bench_case(1, 32, 32, 80, 80, 3, 1);
+    bench_case(1, 32, 32, 80, 80, 3, 1);
+    bench_case(1, 64, 32, 7, 7, 3, 1);
+    bench_case(1, 64, 64, 7, 7, 3, 1);
+    bench_case(1, 64, 128, 7, 7, 3, 1);
+    bench_case(1, 64, 256, 7, 7, 3, 1);
+    bench_case(1, 64, 512, 7, 7, 3, 1);
+    bench_case(1, 64, 1024, 7, 7, 3, 1);
+    bench_case(1, 64, 32, 14, 14, 3, 1);
+    bench_case(1, 64, 64, 14, 14, 3, 1);
+    bench_case(1, 64, 128, 14, 14, 3, 1);
+    bench_case(1, 64, 256, 14, 14, 3, 1);
+    bench_case(1, 64, 512, 14, 14, 3, 1);
+    bench_case(1, 64, 1024, 14, 14, 3, 1);
+    bench_case(1, 128, 128, 14, 14, 3, 1);
+    bench_case(1, 128, 256, 14, 14, 3, 1);
+    bench_case(1, 512, 512, 14, 14, 3, 1);
+    bench_case(1, 256, 512, 14, 14, 3, 1);
+    bench_case(1, 512, 1024, 14, 14, 3, 1);
+    bench_case(1, 1024, 1024, 14, 14, 3, 1);
+
+    std::string algo_name = "IM2COLMATMUL:X86_F32_MKL_PACKA:192";
+    std::string algo_name1 = "IM2COLMATMUL:X86_F32_6x16:192";
+    printf("Benchmark IM2COLMATMUL:X86_F32_6x16 algo\n");
+    benchmark_impl_comp(
+            param, shapes_and_computation, algo_name, algo_name1, RUNS, {1, {4}},
+            {1, {4}}, data_type);
+    benchmark_impl_comp(
+            param, shapes_and_computation, algo_name, algo_name1, RUNS, {1, {7}},
+            {1, {7}}, data_type);
+    shapes_and_computation.clear();
+}
+
 TEST_F(X86_BENCHMARK_MULTI_THREADS, BENCHMARK_CONVBIAS_IM2COL_INT8X8X32) {
     constexpr size_t RUNS = 50;
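As a sanity check on the computations formula inside bench_case, take the first case, bench_case(1, 32, 32, 200, 200, 3, 1): the destination holds 1 * 32 * 200 * 200 = 1,280,000 elements, so computations = (32 * 3 * 3 * 1,280,000 * 2 + 1,280,000) * 1e-6 = 738.56 millions of float operations per run, which the benchmark presumably divides by measured time to report throughput. The same arithmetic as a snippet:

#include <cstdio>

int main() {
    // bench_case(1, 32, 32, 200, 200, 3, 1): dst shape is {1, 32, 200, 200}.
    double dst_elems = 1.0 * 32 * 200 * 200;  // 1,280,000 output elements
    double computations =
            (32 * 3 * 3 * dst_elems * 2  // one multiply-add per filter tap
             + dst_elems)                // plus the bias add
            * 1e-6;                      // scale to millions of operations
    printf("%.2f Mflop\n", computations);  // prints 738.56
    return 0;
}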
dnn/test/x86/matrix_mul.cpp

@@ -83,6 +83,12 @@ TEST_F(X86, MATRIX_MUL_AVX2_MK8_8X8) {
             "X86_F32MK8_8X8", param::MatrixMul::Format::MK8, 1, 1e-3, false);
 }

+TEST_F(X86, MATRIX_MUL_AVX2_6x16) {
+    matrix_mul::check_matrix_mul(
+            dtype::Float32{}, dtype::Float32{}, dtype::Float32{}, handle(),
+            "X86_F32_6x16", param::MatrixMul::Format::DEFAULT, 1, 1e-3, false);
+}
+
 #if MEGDNN_WITH_BENCHMARK
 TEST_F(X86, BENCHMARK_MATRIX_MUL_AVX2_MK8_8X8) {

@@ -93,6 +99,14 @@ TEST_F(X86, BENCHMARK_MATRIX_MUL_AVX2_MK8_8X8) {
             dtype::Float32{}, dtype::Float32{}, "X86_F32_BLAS");
 }

+TEST_F(X86, BENCHMARK_MATRIX_MUL_AVX2_6x16) {
+    auto args = matrix_mul::get_benchmark_matmul_mk_packed_args(8);
+    matrix_mul::benchmark_with_contrast(
+            handle(), args, dtype::Float32{}, dtype::Float32{}, dtype::Float32{},
+            "X86_F32_6x16", param::MatrixMul::Format::DEFAULT, dtype::Float32{},
+            dtype::Float32{}, dtype::Float32{}, "X86_F32_BLAS");
+}
+
 TEST_F(X86, BENCHMARK_MATRIX_MUL_8X8X32) {
     constexpr size_t RUNS = 50;
     auto rng = std::make_unique<UniformIntRNG>(-127, 127);