Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
217999b1
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
404
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
217999b1
编写于
8月 15, 2022
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(arm): add winograd F43 NCHW44 algo and winograd F43 44 algo
GitOrigin-RevId: a981b2f61b123d6d5b476e8316c8bb83f0367647
上级
f0f6f5fe
变更
12
展开全部
显示空白变更内容
内联
并排
Showing
12 changed file
with
1762 addition
and
42 deletion
+1762
-42
dnn/src/common/unroll_macro.h
dnn/src/common/unroll_macro.h
+9
-0
dnn/src/fallback/conv_bias/gi/fp32/algos.cpp
dnn/src/fallback/conv_bias/gi/fp32/algos.cpp
+78
-0
dnn/src/fallback/conv_bias/gi/fp32/algos.h
dnn/src/fallback/conv_bias/gi/fp32/algos.h
+35
-0
dnn/src/fallback/conv_bias/gi/fp32/strategy.h
dnn/src/fallback/conv_bias/gi/fp32/strategy.h
+5
-0
dnn/src/fallback/conv_bias/gi/fp32/strategy_4x3_4x4.cpp
dnn/src/fallback/conv_bias/gi/fp32/strategy_4x3_4x4.cpp
+340
-0
dnn/src/fallback/conv_bias/gi/fp32/strategy_f43_mk4_nchw44.cpp
...rc/fallback/conv_bias/gi/fp32/strategy_f43_mk4_nchw44.cpp
+1181
-0
dnn/src/fallback/conv_bias/opr_impl.cpp
dnn/src/fallback/conv_bias/opr_impl.cpp
+9
-1
dnn/src/fallback/conv_bias/opr_impl.h
dnn/src/fallback/conv_bias/opr_impl.h
+4
-0
dnn/test/arm_common/conv_bias.cpp
dnn/test/arm_common/conv_bias.cpp
+21
-0
dnn/test/common/conv_bias.cpp
dnn/test/common/conv_bias.cpp
+57
-37
dnn/test/common/conv_bias.h
dnn/test/common/conv_bias.h
+4
-4
dnn/test/fallback/conv_bias.cpp
dnn/test/fallback/conv_bias.cpp
+19
-0
未找到文件。
dnn/src/common/unroll_macro.h
浏览文件 @
217999b1
...
@@ -177,6 +177,15 @@
...
@@ -177,6 +177,15 @@
UNROLL_RAW_5x2(cb, v0, ##a) \
UNROLL_RAW_5x2(cb, v0, ##a) \
cb(5, 0, ##a) cb(5, 1, ##a)
cb(5, 0, ##a) cb(5, 1, ##a)
//! Expands cb(i, j, a...) once for every pair with i in [0, 4) and j in
//! [0, 6), emitted row by row (all j for i == 0 first). The v0 parameter is
//! accepted but never referenced by the expansion.
#define UNROLL_RAW_4x6(cb, v0, a...) \
cb(0, 0, ##a) cb(0, 1, ##a) cb(0, 2, ##a) cb(0, 3, ##a) cb(0, 4, ##a) cb(0, 5, ##a) \
cb(1, 0, ##a) cb(1, 1, ##a) cb(1, 2, ##a) cb(1, 3, ##a) cb(1, 4, ##a) cb(1, 5, ##a) \
cb(2, 0, ##a) cb(2, 1, ##a) cb(2, 2, ##a) cb(2, 3, ##a) cb(2, 4, ##a) cb(2, 5, ##a) \
cb(3, 0, ##a) cb(3, 1, ##a) cb(3, 2, ##a) cb(3, 3, ##a) cb(3, 4, ##a) cb(3, 5, ##a)
//! Same as UNROLL_RAW_4x6 followed by one more row (i == 4), i.e. every pair
//! with i in [0, 5) and j in [0, 6).
#define UNROLL_RAW_5x6(cb, v0, a...) \
UNROLL_RAW_4x6(cb, v0, ##a) \
cb(4, 0, ##a) cb(4, 1, ##a) cb(4, 2, ##a) cb(4, 3, ##a) cb(4, 4, ##a) cb(4, 5, ##a)
#define UNROLL_CALL0_D2(step, step2, cb, v...) \
#define UNROLL_CALL0_D2(step, step2, cb, v...) \
UNROLL_RAW_##step##x##step2(cb, 0, ##v)
UNROLL_RAW_##step##x##step2(cb, 0, ##v)
#define UNROLL_CALL1_D2(step, step2, cb, v...) \
#define UNROLL_CALL1_D2(step, step2, cb, v...) \
...
...
dnn/src/fallback/conv_bias/gi/fp32/algos.cpp
浏览文件 @
217999b1
...
@@ -218,6 +218,44 @@ MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
...
@@ -218,6 +218,44 @@ MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
AlgoFP32WinogradF63_4x4
,
winograd
::
winograd_6x3_4x4_f
,
AlgoFP32WinogradF63_4x4
,
winograd
::
winograd_6x3_4x4_f
,
megdnn_fallback_winograd_fp32
,
param
::
MatrixMul
::
Format
::
MK4
);
megdnn_fallback_winograd_fp32
,
param
::
MatrixMul
::
Format
::
MK4
);
/* ======================= AlgoFP32WinogradF43_4x4 ======================== */

/**
 * Report whether this winograd algorithm can handle the given problem.
 *
 * Accepted problems are NCHW, non-flipped, 3x3 kernel, stride 1, dilation 1,
 * DEFAULT compute mode, Float32 source, with both icpg and ocpg divisible by
 * 4; in addition the wrapped matmul algorithm must use PackMode::NO_PACK and
 * must itself accept the matmul problem derived from \p param.
 *
 * The inexpensive field comparisons run first so that the strategy object and
 * the matmul kernel parameter are only built for problems that already passed
 * them. Each condition is checked exactly once.
 *
 * \param param convolution problem description
 * \return true when every condition above holds, false otherwise (also false
 *         when the MIDOUT region is compiled out)
 */
bool ConvBiasImpl::AlgoFP32WinogradF43_4x4::usable(
        const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_fallback_winograd_fp32, 6, 0) {
        using Strategy = winograd::winograd_4x3_4x4_f;
        using PackMode = fallback::MatrixMulImpl::AlgoBase::PackMode;
        const auto& fm = param.filter_meta;

        //! channel counts must be multiples of the 4-wide packing
        if (fm.icpg % 4 != 0 || fm.ocpg % 4 != 0)
            return false;

        //! layout / geometry / dtype constraints, none of which need the
        //! strategy object
        const bool geometry_ok =
                fm.format == param::ConvBias::Format::NCHW && !fm.should_flip &&
                (fm.spatial[0] == fm.spatial[1] && fm.spatial[0] == 3) &&
                (fm.stride[0] == fm.stride[1] && fm.stride[0] == 1) &&
                (fm.dilation[0] == fm.dilation[1] && fm.dilation[0] == 1);
        const bool type_ok =
                param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
                param.src_type.enumv() == DTypeEnum::Float32;
        if (!geometry_ok || !type_ok)
            return false;

        if (m_matmul_algo->packmode() != PackMode::NO_PACK)
            return false;

        //! only now pay for building the matmul problem
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        auto&& matmul_param =
                megdnn::winograd::ConvBias<Strategy, param::MatrixMul::Format::MK4>(
                        strategy, m_tile_size, param)
                        .get_matmul_kern_param(param);
        return m_matmul_algo->usable(matmul_param);
    }
    MIDOUT_END();
    return false;
}

MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
        AlgoFP32WinogradF43_4x4, winograd::winograd_4x3_4x4_f,
        megdnn_fallback_winograd_fp32, param::MatrixMul::Format::MK4);
/* =================== AlgoFP32WinogradF23_4x4_NCHW44 =================== */
/* =================== AlgoFP32WinogradF23_4x4_NCHW44 =================== */
bool
ConvBiasImpl
::
AlgoFP32WinogradF23_4x4_NCHW44
::
usable
(
bool
ConvBiasImpl
::
AlgoFP32WinogradF23_4x4_NCHW44
::
usable
(
...
@@ -297,6 +335,46 @@ MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
...
@@ -297,6 +335,46 @@ MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
AlgoFP32WinogradF63_4x4_NCHW44
,
winograd
::
winograd_F63_mk4_f_nchw44
,
AlgoFP32WinogradF63_4x4_NCHW44
,
winograd
::
winograd_F63_mk4_f_nchw44
,
megdnn_fallback_winograd_fp32
,
param
::
MatrixMul
::
Format
::
MK4
);
megdnn_fallback_winograd_fp32
,
param
::
MatrixMul
::
Format
::
MK4
);
/* =================== AlgoFP32WinogradF43_4x4_NCHW44 ===================== */
bool
ConvBiasImpl
::
AlgoFP32WinogradF43_4x4_NCHW44
::
usable
(
const
NCBKernSizeParam
&
param
,
AlgoSelectionStrategy
/*algo_selection_strategy*/
)
const
{
MEGDNN_MARK_USED_VAR
(
param
);
MIDOUT_BEGIN
(
megdnn_fallback_winograd_fp32
,
midout_iv
(
"AlgoFP32WinogradF43_4x4_NCHW44"
_hash
))
{
if
(
param
.
filter_meta
.
icpg
%
4
!=
0
||
param
.
filter_meta
.
ocpg
%
4
!=
0
)
return
false
;
using
Strategy
=
winograd
::
winograd_F43_mk4_f_nchw44
;
Strategy
strategy
(
param
.
src_type
,
param
.
filter_type
,
param
.
dst_type
);
auto
&&
matmul_param
=
megdnn
::
winograd
::
ConvBias
<
Strategy
,
param
::
MatrixMul
::
Format
::
MK4
>
(
strategy
,
m_tile_size
,
param
)
.
get_matmul_kern_param
(
param
);
return
m_matmul_algo
->
usable
(
matmul_param
)
&&
m_matmul_algo
->
packmode
()
==
fallback
::
MatrixMulImpl
::
AlgoBase
::
PackMode
::
NO_PACK
&&
param
.
filter_meta
.
format
==
param
::
ConvBias
::
Format
::
NCHW44
&&
!
param
.
filter_meta
.
should_flip
&&
(
param
.
filter_meta
.
spatial
[
0
]
==
param
.
filter_meta
.
spatial
[
1
]
&&
param
.
filter_meta
.
spatial
[
0
]
==
3
)
&&
(
param
.
filter_meta
.
stride
[
0
]
==
param
.
filter_meta
.
stride
[
1
]
&&
param
.
filter_meta
.
stride
[
0
]
==
1
)
&&
(
param
.
filter_meta
.
dilation
[
0
]
==
param
.
filter_meta
.
dilation
[
1
]
&&
param
.
filter_meta
.
dilation
[
0
]
==
1
)
&&
param
.
compute_mode
==
param
::
ConvBias
::
ComputeMode
::
DEFAULT
&&
param
.
src_type
.
enumv
()
==
DTypeEnum
::
Float32
&&
param
.
filter_meta
.
icpg
%
4
==
0
&&
param
.
filter_meta
.
ocpg
%
4
==
0
;
}
MIDOUT_END
();
return
false
;
}
MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL
(
AlgoFP32WinogradF43_4x4_NCHW44
,
winograd
::
winograd_F43_mk4_f_nchw44
,
megdnn_fallback_winograd_fp32
,
param
::
MatrixMul
::
Format
::
MK4
);
/* =================== AlgoFP32WinogradF73_4x4_NCHW44 ===================== */
/* =================== AlgoFP32WinogradF73_4x4_NCHW44 ===================== */
bool
ConvBiasImpl
::
AlgoFP32WinogradF73_4x4_NCHW44
::
usable
(
bool
ConvBiasImpl
::
AlgoFP32WinogradF73_4x4_NCHW44
::
usable
(
...
...
dnn/src/fallback/conv_bias/gi/fp32/algos.h
浏览文件 @
217999b1
...
@@ -81,6 +81,23 @@ public:
...
@@ -81,6 +81,23 @@ public:
MEGDNN_DECL_ALGO_TYPE
(
GI_COMMON_WINOGRAD_F63_4X4_FP32
)
MEGDNN_DECL_ALGO_TYPE
(
GI_COMMON_WINOGRAD_F63_4X4_FP32
)
};
};
//! Winograd conv-bias algorithm descriptor built on top of a matmul algorithm
//! and a tile size. The remaining members (including m_name, m_matmul_algo and
//! m_tile_size) presumably come from MEGDNN_WINOGRAD_ALGO_FUN_DECLARE.
class ConvBiasImpl::AlgoFP32WinogradF43_4x4 final : public AlgoBase {
public:
    //! \param matmul_algo matmul implementation used by this algorithm
    //! \param tile_size   tile size forwarded to the winograd machinery
    AlgoFP32WinogradF43_4x4(
            fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size)
            : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {}

    //! Algorithm name; composed on the first call and cached in m_name.
    const char* name() const override {
        if (!m_name.empty()) {
            return m_name.c_str();
        }
        m_name = ConvBiasImpl::algo_name<ConvBias::WinogradParam>(
                m_matmul_algo->name(), {4, 4, m_tile_size, 3});
        return m_name.c_str();
    }

    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }

    MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32);
    MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F43_4X4_FP32)
};
class
ConvBiasImpl
::
AlgoFP32WinogradF54
final
:
public
AlgoBase
{
class
ConvBiasImpl
::
AlgoFP32WinogradF54
final
:
public
AlgoBase
{
public:
public:
AlgoFP32WinogradF54
(
AlgoFP32WinogradF54
(
...
@@ -156,6 +173,24 @@ public:
...
@@ -156,6 +173,24 @@ public:
MEGDNN_DECL_ALGO_TYPE
(
GI_COMMON_WINOGRAD_F63_4X4_NCHW44_F32
)
MEGDNN_DECL_ALGO_TYPE
(
GI_COMMON_WINOGRAD_F63_4X4_NCHW44_F32
)
};
};
//! Winograd conv-bias algorithm descriptor for the NCHW44 format, built on top
//! of a matmul algorithm and a tile size. The remaining members (including
//! m_name, m_matmul_algo and m_tile_size) presumably come from
//! MEGDNN_WINOGRAD_ALGO_FUN_DECLARE.
class ConvBiasImpl::AlgoFP32WinogradF43_4x4_NCHW44 final : public AlgoBase {
public:
    //! \param matmul_algo matmul implementation used by this algorithm
    //! \param tile_size   tile size forwarded to the winograd machinery
    AlgoFP32WinogradF43_4x4_NCHW44(
            fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size)
            : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {}

    //! Algorithm name; composed on the first call (with the NCHW44 format
    //! tag) and cached in m_name.
    const char* name() const override {
        if (!m_name.empty()) {
            return m_name.c_str();
        }
        m_name = ConvBiasImpl::algo_name<ConvBias::WinogradParam>(
                m_matmul_algo->name(), {4, 4, m_tile_size, 3},
                param::ConvBias::Format::NCHW44);
        return m_name.c_str();
    }

    AlgoAttribute attribute() const override {
        return AlgoAttribute::REPRODUCIBLE;
    }

    MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32);
    MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F43_4X4_NCHW44_F32)
};
class
ConvBiasImpl
::
AlgoFP32WinogradF73_4x4_NCHW44
final
:
public
AlgoBase
{
class
ConvBiasImpl
::
AlgoFP32WinogradF73_4x4_NCHW44
final
:
public
AlgoBase
{
public:
public:
AlgoFP32WinogradF73_4x4_NCHW44
(
AlgoFP32WinogradF73_4x4_NCHW44
(
...
...
dnn/src/fallback/conv_bias/gi/fp32/strategy.h
浏览文件 @
217999b1
...
@@ -16,6 +16,8 @@ MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 4, 3, 1, 1, winograd_4x
...
@@ -16,6 +16,8 @@ MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 4, 3, 1, 1, winograd_4x
MEGDNN_REG_WINOGRAD_STRATEGY
(
float
,
float
,
float
,
float
,
6
,
3
,
4
,
4
,
winograd_6x3_4x4_f
)
MEGDNN_REG_WINOGRAD_STRATEGY
(
float
,
float
,
float
,
float
,
6
,
3
,
4
,
4
,
winograd_6x3_4x4_f
)
//! Declares the all-float strategy winograd_4x3_4x4_f with numeric arguments
//! 4, 3, 4, 4. NOTE(review): going by the strategy name these are presumably
//! output block size, kernel size and the two pack sizes -- confirm against
//! the macro definition.
MEGDNN_REG_WINOGRAD_STRATEGY
(
float
,
float
,
float
,
float
,
4
,
3
,
4
,
4
,
winograd_4x3_4x4_f
)
MEGDNN_REG_WINOGRAD_STRATEGY
(
float
,
float
,
float
,
float
,
5
,
4
,
1
,
1
,
winograd_5x4_1x1_f
)
MEGDNN_REG_WINOGRAD_STRATEGY
(
float
,
float
,
float
,
float
,
5
,
4
,
1
,
1
,
winograd_5x4_1x1_f
)
MEGDNN_REG_WINOGRAD_STRATEGY
(
float
,
float
,
float
,
float
,
4
,
5
,
1
,
1
,
winograd_4x5_1x1_f
)
MEGDNN_REG_WINOGRAD_STRATEGY
(
float
,
float
,
float
,
float
,
4
,
5
,
1
,
1
,
winograd_4x5_1x1_f
)
...
@@ -26,6 +28,9 @@ MEGDNN_REG_WINOGRAD_STRATEGY(
...
@@ -26,6 +28,9 @@ MEGDNN_REG_WINOGRAD_STRATEGY(
MEGDNN_REG_WINOGRAD_STRATEGY
(
MEGDNN_REG_WINOGRAD_STRATEGY
(
float
,
float
,
float
,
float
,
6
,
3
,
4
,
4
,
winograd_F63_mk4_f_nchw44
)
float
,
float
,
float
,
float
,
6
,
3
,
4
,
4
,
winograd_F63_mk4_f_nchw44
)
//! Declares the all-float strategy winograd_F43_mk4_f_nchw44 with numeric
//! arguments 4, 3, 4, 4. NOTE(review): presumably output block size, kernel
//! size and the two pack sizes -- confirm against the macro definition.
MEGDNN_REG_WINOGRAD_STRATEGY
(
float
,
float
,
float
,
float
,
4
,
3
,
4
,
4
,
winograd_F43_mk4_f_nchw44
)
MEGDNN_REG_WINOGRAD_STRATEGY
(
MEGDNN_REG_WINOGRAD_STRATEGY
(
float
,
float
,
float
,
float
,
7
,
3
,
4
,
4
,
winograd_F73_mk4_f_nchw44
)
float
,
float
,
float
,
float
,
7
,
3
,
4
,
4
,
winograd_F73_mk4_f_nchw44
)
}
// namespace winograd
}
// namespace winograd
...
...
dnn/src/fallback/conv_bias/gi/fp32/strategy_4x3_4x4.cpp
0 → 100644
浏览文件 @
217999b1
#include "src/common/unroll_macro.h"
#include "src/common/utils.h"
#include "src/common/winograd/winograd_helper.h"
#include "src/fallback/conv_bias/gi/fp32/filter_transform.h"
#include "src/fallback/conv_bias/gi/fp32/helper.h"
#include "src/fallback/conv_bias/gi/fp32/strategy.h"
#include "src/fallback/conv_bias/winograd/winograd.h"
#include "src/fallback/elemwise_helper/op_unary.h"
#include "midout.h"
MIDOUT_DECL
(
megdnn_fallback_winograd_fp32_F43_4x4
)
using
namespace
megdnn
;
using
namespace
fallback
;
namespace
{
#define MLAF GiMultiplyAddScalarFloat32
#define MLSF GiMultiplySubScalarFloat32
//! Input-side helpers for the 6x6-tile (alpha = 4 + 3 - 1) winograd transform
//! operating on groups of four channels.
struct InputTransform4X3 {
    /**
     * @brief Convert layout from NCHW to NCHW44 (i.e. NC4HW4) for one tile.
     *
     * Copies the alpha x alpha window starting at (ih_start, iw_start) of up
     * to four consecutive channels into @p patch (channel-major), then calls
     * transpose_4x4 nine times to produce @p patchT.
     *
     * @tparam inner Whether all data in [[ih_start, ih_start+6), [iw_start,
     *               iw_start+6)] is inside @p input. When false, the patch is
     *               zeroed first and only the in-range part is copied.
     * @param input    Pointer to all input data (CHW, excluding dim N)
     * @param patch    Scratch buffer of 4 * 6 * 6 floats holding the current
     *                 block, ordered by C, H, W
     * @param patchT   RETURN: transposed block
     * @param ih_start Start index along H of the current block (may be < 0)
     * @param iw_start Start index along W of the current block (may be < 0)
     * @param IH       Dim H of input
     * @param IW       Dim W of input
     * @param ic       First channel index handled by this call
     * @param IC       Dim C of input
     */
    template <bool inner>
    static void transpose(
            const float* input, float* patch, float* patchT, int ih_start,
            int iw_start, size_t IH, size_t IW, size_t ic, size_t IC) {
        constexpr size_t alpha = 4 + 3 - 1;

        //! zero fill whenever part of the patch will not be overwritten
        if (!inner || ic + 4 > IC) {
            memset(patch, 0, sizeof(float) * 4 * alpha * alpha);
        }

        if (inner) {
            const float* chan_src = input + ic * IH * IW + ih_start * IW + iw_start;
            for (size_t ico = 0; ico < 4; ++ico) {
                if (ic + ico >= IC) {
                    continue;
                }
                float* chan_patch = patch + ico * alpha * alpha;
                //! Row i: columns 0..3 via a full load, columns 4..5 via a
                //! low-half load.
                //! NOTE(review): the second GiStoreFloat32 presumably writes
                //! four lanes starting at column 4 of a 6-wide row, i.e. two
                //! floats beyond the row; rows are emitted in ascending order
                //! so later stores overwrite the spill, but the last row of
                //! the last channel would spill past 4 * 6 * 6 floats --
                //! confirm the caller's buffer layout tolerates this.
#define cb(i)                                                        \
    auto v##i##0 = GiLoadFloat32(chan_src + i * IW);                 \
    GiStoreFloat32(chan_patch + i * alpha, v##i##0);                 \
    auto v##i##1 = GiLoadFloat32LowHalf(chan_src + i * IW + 4);      \
    GiStoreFloat32(chan_patch + i * alpha + 4, v##i##1);
                UNROLL_CALL_NOWRAPPER(6, cb);
#undef cb
                chan_src += IH * IW;
            }
        } else {
            //! clip the window to the valid input area
            const size_t row_begin = std::max(0, ih_start);
            const size_t row_end = std::min(ih_start + alpha, IH);
            const size_t col_begin = std::max(0, iw_start);
            const size_t col_end = std::min(iw_start + alpha, IW);

            for (size_t ico = 0; ico < 4 && ic + ico < IC; ++ico) {
                for (size_t ih = row_begin; ih < row_end; ++ih) {
                    const float* src_row = input + (ic + ico) * IH * IW + ih * IW;
                    float* dst_row =
                            patch + ico * alpha * alpha + (ih - ih_start) * alpha;
                    for (size_t iw = col_begin; iw < col_end; ++iw) {
                        dst_row[iw - iw_start] = src_row[iw];
                    }
                }
            }
        }

        //! 36 == alpha * alpha is the per-channel stride inside patch
#define cb(i) transpose_4x4(patch + i * 4, patchT + i * 16, 36, 4);
        UNROLL_CALL_NOWRAPPER(9, cb);
#undef cb
    }

    /**
     * @brief Apply BT * d * B to the 6x6 block in @p patchT (four floats per
     * element) and scatter the 36 results into @p input_transform_buf.
     *
     * @param patchT              block produced by transpose()
     * @param input_transform_buf destination buffer
     * @param unit_idx            index of this tile inside the current batch
     * @param nr_units_in_tile    number of tiles in the current batch
     * @param ic                  first channel index of the group
     * @param IC                  Dim C of input
     */
    static void transform(
            const float* patchT, float* input_transform_buf, size_t unit_idx,
            size_t nr_units_in_tile, size_t ic, size_t IC) {
        constexpr size_t alpha = 4 + 3 - 1;

#define cb(m, n) \
    GI_FLOAT32_t d##m##n = GiLoadFloat32(patchT + m * alpha * 4 + n * 4), wd##m##n;
        UNROLL_CALL_NOWRAPPER_D2(6, 6, cb);
#undef cb

        //! BT
        //!   4   0  -5   0   1   0
        //!   0  -4  -4   1   1   0
        //!   0   4  -4  -1   1   0
        //!   0  -2  -1   2   1   0
        //!   0   2  -1  -2   1   0
        //!   0   4   0  -5   0   1
        //!
        //! wd0n = 4 * (d0n - d2n) + (d4n - d2n)
        //! wd1n = (d3n + d4n) - 4 * (d1n + d2n)
        //! wd2n = 4 * (d1n - d2n) + (d4n - d3n)
        //! wd3n = (d4n - d2n) - 2 * (d1n - d3n)
        //! wd4n = 2 * (d1n - d3n) + (d4n - d2n)
        //! wd5n = 4 * (d1n - d3n) + (d5n - d3n)
#define cb(n)                                                         \
    {                                                                 \
        auto&& d4subd2 = SUBF(d4##n, d2##n);                          \
        auto&& d1subd3 = SUBF(d1##n, d3##n);                          \
        wd0##n = MLAF(d4subd2, SUBF(d0##n, d2##n), 4.0f);             \
        wd1##n = MLSF(ADDF(d3##n, d4##n), ADDF(d1##n, d2##n), 4.0f);  \
        wd2##n = MLAF(SUBF(d4##n, d3##n), SUBF(d1##n, d2##n), 4.0f);  \
        auto&& double_d1subd3 = MULSF(d1subd3, 2.0f);                 \
        wd3##n = SUBF(d4subd2, double_d1subd3);                       \
        wd4##n = ADDF(double_d1subd3, d4subd2);                       \
        wd5##n = MLAF(SUBF(d5##n, d3##n), d1subd3, 4.0f);             \
    }
        UNROLL_CALL_NOWRAPPER(6, cb);
#undef cb

        //! B
        //!   4   0   0   0   0   0
        //!   0  -4   4  -2   2   4
        //!  -5  -4  -4  -1  -1   0
        //!   0   1  -1   2  -2  -5
        //!   1   1   1   1   1   0
        //!   0   0   0   0   0   1
        //!
        //! dm0 = 4 * (wdm0 - wdm2) + (wdm4 - wdm2)
        //! dm1 = (wdm3 + wdm4) - 4 * (wdm1 + wdm2)
        //! dm2 = 4 * (wdm1 - wdm2) + (wdm4 - wdm3)
        //! dm3 = (wdm4 - wdm2) - 2 * (wdm1 - wdm3)
        //! dm4 = 2 * (wdm1 - wdm3) + (wdm4 - wdm2)
        //! dm5 = 4 * (wdm1 - wdm3) + (wdm5 - wdm3)
#define cb(m)                                                                       \
    {                                                                               \
        auto&& wd4subwd2 = SUBF(wd##m##4, wd##m##2);                                \
        auto&& wd1subwd3 = SUBF(wd##m##1, wd##m##3);                                \
        d##m##0 = MLAF(wd4subwd2, SUBF(wd##m##0, wd##m##2), 4.0f);                  \
        d##m##1 = MLSF(ADDF(wd##m##3, wd##m##4), ADDF(wd##m##1, wd##m##2), 4.0f);   \
        d##m##2 = MLAF(SUBF(wd##m##4, wd##m##3), SUBF(wd##m##1, wd##m##2), 4.0f);   \
        auto&& double_wd1subwd3 = MULSF(wd1subwd3, 2.0f);                           \
        d##m##3 = SUBF(wd4subwd2, double_wd1subwd3);                                \
        d##m##4 = ADDF(double_wd1subwd3, wd4subwd2);                                \
        d##m##5 = MLAF(SUBF(wd##m##5, wd##m##3), wd1subwd3, 4.0f);                  \
    }
        UNROLL_CALL_NOWRAPPER(6, cb);
#undef cb

        //! element (m, n) of the result goes to plane (m * alpha + n); inside
        //! a plane the slot is selected by channel group and tile index
        const size_t icb = ic / 4;
        const size_t ICB = IC / 4;
        const size_t plane_stride = ICB * 4 * nr_units_in_tile;
        float* slot = input_transform_buf + icb * nr_units_in_tile * 4 + unit_idx * 4;
#define cb(m, n) GiStoreFloat32(slot + (m * alpha + n) * plane_stride, d##m##n);
        UNROLL_CALL_NOWRAPPER_D2(6, 6, cb);
#undef cb
    }
};  // InputTransform4X3
/**
 * Output-side helper for the 6x6-tile winograd transform: reduces one
 * transformed 6x6 block to a 4x4 output block for four output channels,
 * applies bias / activation and writes the result to the output tensor.
 *
 * @tparam bmode bias mode selecting how (and when) bias is added
 * @tparam Op    activation functor constructed from (src_dtype, dst_dtype)
 */
template <BiasMode bmode, typename Op>
struct OutputTransform4X3 {
    /**
     * @param output_transform_buf matmul result, read at plane (m * 6 + n)
     * @param bias                 bias data, interpretation depends on bmode
     * @param output               destination, indexed as [oc][oh][ow]
     * @param transform_mid_buf    scratch for the 4x4x4 intermediate result
     * @param oh_start, ow_start   top-left output coordinate of this block
     * @param OH, OW               output spatial size
     * @param oc_start, oc_end     output channel range of the current call
     * @param oc_index             channel offset relative to oc_start
     * @param unit_idx             index of this tile inside the current batch
     * @param nr_units_in_tile     number of tiles in the current batch
     * @param src_dtype, dst_dtype forwarded to the Op constructor
     */
    static void transform(
            const float* output_transform_buf, const float* bias, float* output,
            float* transform_mid_buf, size_t oh_start, size_t ow_start, size_t OH,
            size_t OW, size_t oc_start, size_t oc_end, size_t oc_index,
            size_t unit_idx, size_t nr_units_in_tile, const DType& src_dtype,
            const DType& dst_dtype) {
        constexpr size_t alpha = 4 + 3 - 1;
        Op op(src_dtype, dst_dtype);

        const size_t oc = oc_start + oc_index;
        const size_t ocb = oc_index / 4;
        const size_t OCB = (oc_end - oc_start) / 4;
        const size_t plane_stride = OCB * nr_units_in_tile * 4;
        const float* slot =
                output_transform_buf + ocb * nr_units_in_tile * 4 + unit_idx * 4;

#define cb(m, n) auto v##m##n = GiLoadFloat32(slot + (m * alpha + n) * plane_stride);
        UNROLL_CALL_NOWRAPPER_D2(6, 6, cb);
#undef cb

        //! AT
        //!   1   1   1   1   1   0
        //!   0   1  -1   2  -2   0
        //!   0   1   1   4   4   0
        //!   0   1  -1   8  -8   1
        //!
        //! t0n = v0n + (v1n + v2n) + (v3n + v4n)
        //! t1n = (v1n - v2n) + 2 * (v3n - v4n)
        //! t2n = (v1n + v2n) + 4 * (v3n + v4n)
        //! t3n = (v1n - v2n) + 8 * (v3n - v4n) + v5n
#define cb(m, n) GI_FLOAT32_t t##m##n;
        UNROLL_CALL_NOWRAPPER_D2(4, 6, cb);
#undef cb

#define cb(n)                                                    \
    {                                                            \
        auto&& v1addv2 = ADDF(v1##n, v2##n);                     \
        auto&& v1subv2 = SUBF(v1##n, v2##n);                     \
        auto&& v3addv4 = ADDF(v3##n, v4##n);                     \
        auto&& v3subv4 = SUBF(v3##n, v4##n);                     \
                                                                 \
        t0##n = ADDF(ADDF(v0##n, v1addv2), v3addv4);             \
        t1##n = MLAF(v1subv2, v3subv4, 2.0f);                    \
        t2##n = MLAF(v1addv2, v3addv4, 4.0f);                    \
        t3##n = ADDF(MLAF(v1subv2, v3subv4, 8.0f), v5##n);       \
    }
        UNROLL_CALL_NOWRAPPER(6, cb);
#undef cb

        //! A
        //!   1   0   0   0
        //!   1   1   1   1
        //!   1  -1   1  -1
        //!   1   2   4   8
        //!   1  -2   4  -8
        //!   0   0   0   1
        //!
        //! vm0 = tm0 + (tm1 + tm2) + (tm3 + tm4)
        //! vm1 = (tm1 - tm2) + 2 * (tm3 - tm4)
        //! vm2 = (tm1 + tm2) + 4 * (tm3 + tm4)
        //! vm3 = (tm1 - tm2) + 8 * (tm3 - tm4) + tm5
#define cb(m)                                                        \
    {                                                                \
        auto&& t1addt2 = ADDF(t##m##1, t##m##2);                     \
        auto&& t1subt2 = SUBF(t##m##1, t##m##2);                     \
        auto&& t3addt4 = ADDF(t##m##3, t##m##4);                     \
        auto&& t3subt4 = SUBF(t##m##3, t##m##4);                     \
        v##m##0 = ADDF(ADDF(t##m##0, t1addt2), t3addt4);             \
        v##m##1 = MLAF(t1subt2, t3subt4, 2.0f);                      \
        v##m##2 = MLAF(t1addt2, t3addt4, 4.0f);                      \
        v##m##3 = ADDF(MLAF(t1subt2, t3subt4, 8.0f), t##m##5);       \
    }
        UNROLL_CALL_NOWRAPPER(4, cb);
#undef cb

        //! per-channel bias: one vector loaded at bias + oc, added everywhere
        if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
            GI_FLOAT32_t vbias = GiLoadFloat32(bias + oc);
#define cb(m, n) v##m##n = GiAddFloat32(v##m##n, vbias);
            UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
#undef cb
        }

        //! for full BIAS mode the activation is applied later, after the
        //! element-wise bias has been added in the scalar loop below
        if (bmode != BiasMode::BIAS) {
#define cb(m, n) v##m##n = op(v##m##n);
            UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
#undef cb
        }

#define cb(m, n) GiStoreFloat32(transform_mid_buf + (4 * m + n) * 4, v##m##n);
        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
#undef cb

        //! scatter the [oh][ow][oc] scratch block, clipped to OH / OW / oc_end
        for (size_t oho = 0; oho < 4 && oh_start + oho < OH; ++oho) {
            const size_t oh = oh_start + oho;
            for (size_t owo = 0; owo < 4 && ow_start + owo < OW; ++owo) {
                const size_t ow = ow_start + owo;
                const float* pixel = transform_mid_buf + oho * 4 * 4 + owo * 4;
                for (size_t oco = 0; oco < 4 && oc + oco < oc_end; ++oco) {
                    const size_t dst_off = (oc + oco) * OH * OW + oh * OW + ow;
                    float res = pixel[oco];
                    if (bmode == BiasMode::BIAS) {
                        res += bias[dst_off];
                        res = op(res);
                    }
                    output[dst_off] = res;
                }
            }
        }
    }
};  // OutputTransform4X3
#undef MLSF
#undef MLAF
}
// namespace
namespace
megdnn
{
namespace
fallback
{
namespace
winograd
{
MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_4x3_4x4_f)

//! Filter transform: forwards every argument unchanged to the MK4
//! instantiation of FilterTransform4X3.
void winograd_4x3_4x4_f::filter(
        const float* filter, float* filter_transform_buf, float* transform_mid_buf,
        size_t OC, size_t IC, size_t oc_start, size_t oc_end) {
    using Mk4FilterTransform =
            FilterTransform4X3<megdnn::param::MatrixMul::Format::MK4>;
    Mk4FilterTransform::transform(
            filter, filter_transform_buf, transform_mid_buf, OC, IC, oc_start,
            oc_end);
}
/**
 * Input transform for one batch of tiles.
 *
 * For each group of four input channels and each tile in the batch, the tile
 * is gathered with InputTransform4X3::transpose (fast path when the whole
 * 6x6 window lies inside the input, clipped path otherwise) and then
 * transformed into \p input_transform_buf.
 *
 * \param input               input data (CHW, excluding dim N)
 * \param input_transform_buf destination of the transformed tiles
 * \param transform_mid_buf   scratch; the first 4 * ALPHA * ALPHA floats are
 *                            used as patch, the area right after as patchT
 * \param IH, IW, IC          input dimensions; IC must be a multiple of 4
 * \param PH, PW              padding along H and W
 * \param unit_start_idx      global index of the first tile in this batch
 * \param nr_units_in_tile    number of tiles in this batch
 */
void winograd_4x3_4x4_f::input(
        const float* input, float* input_transform_buf, float* transform_mid_buf,
        size_t IH, size_t IW, size_t IC, size_t PH, size_t PW, size_t unit_start_idx,
        size_t nr_units_in_tile) {
    megdnn_assert(IC % 4 == 0);

    const auto tiles_per_row =
            div_ceil<size_t>(IW + 2 * PW - KERNEL_SIZE + 1, OUTPUT_BLOCK_SIZE);
    const int ih_limit = static_cast<int>(IH);
    const int iw_limit = static_cast<int>(IW);

    float* const patch = transform_mid_buf;
    float* const patchT = transform_mid_buf + 4 * ALPHA * ALPHA;

    for (size_t ic = 0; ic < IC; ic += 4) {
        for (size_t unit_idx = 0; unit_idx < nr_units_in_tile; ++unit_idx) {
            const size_t tile_id = unit_start_idx + unit_idx;
            const size_t tile_row = tile_id / tiles_per_row;
            const size_t tile_col = tile_id % tiles_per_row;

            //! top-left input coordinate; negative inside the padding
            const int ih_start = static_cast<int>(tile_row * OUTPUT_BLOCK_SIZE - PH);
            const int iw_start = static_cast<int>(tile_col * OUTPUT_BLOCK_SIZE - PW);

            //! 6 is the window edge used by InputTransform4X3 (4 + 3 - 1)
            const bool fully_inside = ih_start >= 0 && ih_start + 6 <= ih_limit &&
                                      iw_start >= 0 && iw_start + 6 <= iw_limit;
            if (fully_inside) {
                InputTransform4X3::transpose<true>(
                        input, patch, patchT, ih_start, iw_start, IH, IW, ic, IC);
            } else {
                InputTransform4X3::transpose<false>(
                        input, patch, patchT, ih_start, iw_start, IH, IW, ic, IC);
            }

            InputTransform4X3::transform(
                    patchT, input_transform_buf, unit_idx, nr_units_in_tile, ic, IC);
        }
    }
}
/**
 * Output transform for one batch of tiles.
 *
 * Walks the output channels [oc_start, oc_end) in steps of four and, for each
 * tile of the batch, dispatches to the OutputTransform4X3 instantiation that
 * matches \p bmode and \p nonline_mode.
 *
 * \param output_transform_buf matmul result to be transformed
 * \param bias                 bias data
 * \param output               destination tensor data
 * \param transform_mid_buf    scratch buffer handed to the transform
 * \param bmode, nonline_mode  runtime selectors for the dispatch macro
 * \param OH, OW               output spatial size
 * \param oc_start, oc_end     output channel range to process
 * \param unit_start_idx       global index of the first tile in this batch
 * \param nr_units_in_tile     number of tiles in this batch
 */
void winograd_4x3_4x4_f::output(
        const float* output_transform_buf, const float* bias, float* output,
        float* transform_mid_buf, BiasMode bmode, NonlineMode nonline_mode, size_t OH,
        size_t OW, size_t oc_start, size_t oc_end, size_t unit_start_idx,
        size_t nr_units_in_tile) {
#define cb(_bmode, _nonline_mode, ...) \
    OutputTransform4X3<_bmode, _nonline_mode>::transform(__VA_ARGS__);

    const auto tiles_per_row = div_ceil<size_t>(OW, OUTPUT_BLOCK_SIZE);

    for (size_t oc = oc_start; oc < oc_end; oc += 4) {
        const size_t oc_index = oc - oc_start;
        for (size_t unit_idx = 0; unit_idx < nr_units_in_tile; ++unit_idx) {
            const size_t tile_id = unit_idx + unit_start_idx;
            //! top-left output coordinate of this tile
            const size_t oh_start = (tile_id / tiles_per_row) * OUTPUT_BLOCK_SIZE;
            const size_t ow_start = (tile_id % tiles_per_row) * OUTPUT_BLOCK_SIZE;
            GI_DISPATCH_CONV_WINOGRAD_BIAS(
                    megdnn_fallback_winograd_fp32_F43_4x4, cb, float, float, bmode,
                    nonline_mode, output_transform_buf, bias, output,
                    transform_mid_buf, oh_start, ow_start, OH, OW, oc_start, oc_end,
                    oc_index, unit_idx, nr_units_in_tile, src_dtype, dst_dtype);
        }
    }
#undef cb
}
}
// namespace winograd
}
// namespace fallback
}
// namespace megdnn
\ No newline at end of file
dnn/src/fallback/conv_bias/gi/fp32/strategy_f43_mk4_nchw44.cpp
0 → 100644
浏览文件 @
217999b1
此差异已折叠。
点击以展开。
dnn/src/fallback/conv_bias/opr_impl.cpp
浏览文件 @
217999b1
...
@@ -121,7 +121,7 @@ public:
...
@@ -121,7 +121,7 @@ public:
for
(
auto
&&
algo
:
matmul_algos
)
{
for
(
auto
&&
algo
:
matmul_algos
)
{
if
(
is_naive
(
algo
))
if
(
is_naive
(
algo
))
continue
;
continue
;
for
(
uint32_t
tile_size
:
{
16
,
8
,
24
,
32
})
{
for
(
uint32_t
tile_size
:
{
16
,
8
,
24
,
32
,
48
,
68
})
{
refhold
.
emplace_back
(
new
AlgoFP32WinogradF23_4x4
(
refhold
.
emplace_back
(
new
AlgoFP32WinogradF23_4x4
(
static_cast
<
fallback
::
MatrixMulImpl
::
AlgoBase
*>
(
algo
),
static_cast
<
fallback
::
MatrixMulImpl
::
AlgoBase
*>
(
algo
),
tile_size
));
tile_size
));
...
@@ -130,10 +130,18 @@ public:
...
@@ -130,10 +130,18 @@ public:
static_cast
<
fallback
::
MatrixMulImpl
::
AlgoBase
*>
(
algo
),
static_cast
<
fallback
::
MatrixMulImpl
::
AlgoBase
*>
(
algo
),
tile_size
));
tile_size
));
m_gi_winograd_algos
.
emplace_back
(
refhold
.
back
().
get
());
m_gi_winograd_algos
.
emplace_back
(
refhold
.
back
().
get
());
refhold
.
emplace_back
(
new
AlgoFP32WinogradF43_4x4
(
static_cast
<
fallback
::
MatrixMulImpl
::
AlgoBase
*>
(
algo
),
tile_size
));
m_gi_winograd_algos
.
emplace_back
(
refhold
.
back
().
get
());
refhold
.
emplace_back
(
new
AlgoFP32WinogradF63_4x4_NCHW44
(
refhold
.
emplace_back
(
new
AlgoFP32WinogradF63_4x4_NCHW44
(
static_cast
<
fallback
::
MatrixMulImpl
::
AlgoBase
*>
(
algo
),
static_cast
<
fallback
::
MatrixMulImpl
::
AlgoBase
*>
(
algo
),
tile_size
));
tile_size
));
m_gi_winograd_algos
.
emplace_back
(
refhold
.
back
().
get
());
m_gi_winograd_algos
.
emplace_back
(
refhold
.
back
().
get
());
refhold
.
emplace_back
(
new
AlgoFP32WinogradF43_4x4_NCHW44
(
static_cast
<
fallback
::
MatrixMulImpl
::
AlgoBase
*>
(
algo
),
tile_size
));
m_gi_winograd_algos
.
emplace_back
(
refhold
.
back
().
get
());
refhold
.
emplace_back
(
new
AlgoFP32WinogradF23_4x4_NCHW44
(
refhold
.
emplace_back
(
new
AlgoFP32WinogradF23_4x4_NCHW44
(
static_cast
<
fallback
::
MatrixMulImpl
::
AlgoBase
*>
(
algo
),
static_cast
<
fallback
::
MatrixMulImpl
::
AlgoBase
*>
(
algo
),
tile_size
));
tile_size
));
...
...
dnn/src/fallback/conv_bias/opr_impl.h
浏览文件 @
217999b1
...
@@ -219,9 +219,11 @@ public:
...
@@ -219,9 +219,11 @@ public:
GI_COMMON_WINOGRAD_F63_FP32
,
GI_COMMON_WINOGRAD_F63_FP32
,
GI_COMMON_WINOGRAD_F43_FP32
,
GI_COMMON_WINOGRAD_F43_FP32
,
GI_COMMON_WINOGRAD_F63_4X4_FP32
,
GI_COMMON_WINOGRAD_F63_4X4_FP32
,
GI_COMMON_WINOGRAD_F43_4X4_FP32
,
GI_COMMON_WINOGRAD_F54_FP32
,
GI_COMMON_WINOGRAD_F54_FP32
,
GI_COMMON_WINOGRAD_F45_FP32
,
GI_COMMON_WINOGRAD_F45_FP32
,
GI_COMMON_WINOGRAD_F23_4X4_NCHW44_F32
,
GI_COMMON_WINOGRAD_F23_4X4_NCHW44_F32
,
GI_COMMON_WINOGRAD_F43_4X4_NCHW44_F32
,
GI_COMMON_WINOGRAD_F63_4X4_NCHW44_F32
,
GI_COMMON_WINOGRAD_F63_4X4_NCHW44_F32
,
GI_COMMON_WINOGRAD_F73_4X4_NCHW44_F32
,
GI_COMMON_WINOGRAD_F73_4X4_NCHW44_F32
,
GI_COMMON_DIRECT_FP32
,
GI_COMMON_DIRECT_FP32
,
...
@@ -382,9 +384,11 @@ private:
...
@@ -382,9 +384,11 @@ private:
class
AlgoFP32WinogradF63
;
class
AlgoFP32WinogradF63
;
class
AlgoFP32WinogradF43
;
class
AlgoFP32WinogradF43
;
class
AlgoFP32WinogradF63_4x4
;
class
AlgoFP32WinogradF63_4x4
;
class
AlgoFP32WinogradF43_4x4
;
class
AlgoFP32WinogradF54
;
class
AlgoFP32WinogradF54
;
class
AlgoFP32WinogradF45
;
class
AlgoFP32WinogradF45
;
class
AlgoFP32WinogradF23_4x4_NCHW44
;
class
AlgoFP32WinogradF23_4x4_NCHW44
;
class
AlgoFP32WinogradF43_4x4_NCHW44
;
class
AlgoFP32WinogradF63_4x4_NCHW44
;
class
AlgoFP32WinogradF63_4x4_NCHW44
;
class
AlgoFP32WinogradF73_4x4_NCHW44
;
class
AlgoFP32WinogradF73_4x4_NCHW44
;
...
...
dnn/test/arm_common/conv_bias.cpp
浏览文件 @
217999b1
...
@@ -1013,6 +1013,27 @@ TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F43_F63) {
...
@@ -1013,6 +1013,27 @@ TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F43_F63) {
handle
(),
3
);
handle
(),
3
);
#endif
#endif
}
}
//! Benchmark comparing the algorithms selected by the two name patterns
//! below; the body is compiled only when MEGDNN_AARCH64 is set.
//! NOTE(review): the trailing 3 and 4 are presumably kernel size and pack
//! size -- confirm against benchmark_winograd_compare's declaration.
TEST_F
(
ARM_COMMON
,
BENCHMARK_CONVBIAS_WINOGRAD_44_F43_F23
)
{
#if MEGDNN_AARCH64
benchmark_winograd_compare
(
"WINOGRAD:.*:4:4:.*:3"
,
"WINOGRAD:.*:4:2"
,
handle
(),
3
,
4
);
#endif
}
//! Weight-preprocess benchmark for algorithms matching the pattern below,
//! called with kernel = 3 and pack_size = 4; the body is compiled only when
//! MEGDNN_AARCH64 is set. io_pack_size is not passed here, so it presumably
//! relies on a default in the declaration -- confirm in the header.
TEST_F
(
ARM_COMMON
,
BENCHMARK_WINOGRAD_F43_44
)
{
#if MEGDNN_AARCH64
benchmark_winograd_weight_preprocess
(
"WINOGRAD:.*:4:4:.*:3"
,
handle
(),
3
,
4
);
#endif
}
//! Weight-preprocess benchmark for the NCHW44 algorithms matching the pattern
//! below, called with kernel = 3, pack_size = 4 and io_pack_size = 4; the
//! body is compiled only when MEGDNN_AARCH64 is set.
TEST_F
(
ARM_COMMON
,
BENCHMARK_WINOGRAD_F43_NCHW44
)
{
#if MEGDNN_AARCH64
benchmark_winograd_weight_preprocess
(
"WINOGRAD_NCHW44:.*:4:4:.*:3"
,
handle
(),
3
,
4
,
4
);
#endif
}
TEST_F
(
ARM_COMMON
,
BENCHMARK_CONVBIAS_WINOGRAD_F63
)
{
TEST_F
(
ARM_COMMON
,
BENCHMARK_CONVBIAS_WINOGRAD_F63
)
{
#if MEGDNN_AARCH64
#if MEGDNN_AARCH64
benchmark_winograd
(
"WINOGRAD:AARCH64_F32K8X12X1:1:6"
,
handle
(),
3
);
benchmark_winograd
(
"WINOGRAD:AARCH64_F32K8X12X1:1:6"
,
handle
(),
3
);
...
...
dnn/test/common/conv_bias.cpp
浏览文件 @
217999b1
...
@@ -902,7 +902,8 @@ void check_conv_bias(
...
@@ -902,7 +902,8 @@ void check_conv_bias(
}
}
#if MEGDNN_WITH_BENCHMARK
#if MEGDNN_WITH_BENCHMARK
std
::
vector
<
conv_bias
::
TestArg
>
get_winograd_benchmark_args
(
std
::
vector
<
conv_bias
::
TestArg
>
get_winograd_benchmark_args
(
size_t
kernel
,
size_t
pack_size
)
{
size_t
kernel
,
size_t
pack_size
,
size_t
io_pack_size
)
{
megdnn_assert
(
io_pack_size
==
1
||
io_pack_size
==
4
);
std
::
vector
<
conv_bias
::
TestArg
>
args
;
std
::
vector
<
conv_bias
::
TestArg
>
args
;
auto
pack
=
[
&
](
size_t
oc
,
size_t
ic
,
size_t
w
,
size_t
h
,
size_t
kernel
,
size_t
p
)
{
auto
pack
=
[
&
](
size_t
oc
,
size_t
ic
,
size_t
w
,
size_t
h
,
size_t
kernel
,
size_t
p
)
{
if
(
ic
%
pack_size
!=
0
||
oc
%
pack_size
!=
0
)
if
(
ic
%
pack_size
!=
0
||
oc
%
pack_size
!=
0
)
...
@@ -915,11 +916,20 @@ std::vector<conv_bias::TestArg> get_winograd_benchmark_args(
...
@@ -915,11 +916,20 @@ std::vector<conv_bias::TestArg> get_winograd_benchmark_args(
param
.
pad_h
=
p
;
param
.
pad_h
=
p
;
param
.
pad_w
=
p
;
param
.
pad_w
=
p
;
if
(
io_pack_size
==
4
)
{
param
.
format
=
param
::
ConvBias
::
Format
::
NCHW44
;
args
.
push_back
(
conv_bias
::
TestArg
{
param
,
TensorShape
{
1
,
ic
/
4
,
h
,
w
,
4
},
TensorShape
{
oc
/
4
,
ic
/
4
,
kernel
,
kernel
,
4
,
4
},
{
1
,
oc
/
4
,
1
,
1
,
4
}});
}
else
{
args
.
push_back
(
conv_bias
::
TestArg
{
args
.
push_back
(
conv_bias
::
TestArg
{
param
,
param
,
TensorShape
{
1
,
ic
,
h
,
w
},
TensorShape
{
1
,
ic
,
h
,
w
},
TensorShape
{
oc
,
ic
,
kernel
,
kernel
},
TensorShape
{
oc
,
ic
,
kernel
,
kernel
},
{
1
,
oc
,
1
,
1
}});
{
1
,
oc
,
1
,
1
}});
}
};
};
for
(
size_t
ic
:
{
8
,
16
,
32
,
64
})
{
for
(
size_t
ic
:
{
8
,
16
,
32
,
64
})
{
...
@@ -950,8 +960,9 @@ std::vector<conv_bias::TestArg> get_winograd_benchmark_args(
...
@@ -950,8 +960,9 @@ std::vector<conv_bias::TestArg> get_winograd_benchmark_args(
}
}
void
benchmark_winograd
(
void
benchmark_winograd
(
const
char
*
algo_name
,
Handle
*
handle
,
size_t
kernel
,
size_t
pack_size
)
{
const
char
*
algo_name
,
Handle
*
handle
,
size_t
kernel
,
size_t
pack_size
,
auto
&&
args
=
get_winograd_benchmark_args
(
kernel
,
pack_size
);
size_t
io_pack_size
)
{
auto
&&
args
=
get_winograd_benchmark_args
(
kernel
,
pack_size
,
io_pack_size
);
using
namespace
conv_bias
;
using
namespace
conv_bias
;
constexpr
size_t
RUN
=
10
;
constexpr
size_t
RUN
=
10
;
Benchmarker
<
Convolution
>
benchmark
(
handle
);
Benchmarker
<
Convolution
>
benchmark
(
handle
);
...
@@ -969,10 +980,17 @@ void benchmark_winograd(
...
@@ -969,10 +980,17 @@ void benchmark_winograd(
opr
->
deduce_layout
(
opr
->
deduce_layout
(
{
arg
.
src
,
dtype
::
Float32
()},
{
arg
.
filter
,
dtype
::
Float32
()},
{
arg
.
src
,
dtype
::
Float32
()},
{
arg
.
filter
,
dtype
::
Float32
()},
{
arg
.
bias
,
dtype
::
Float32
()},
{},
dst_layout
);
{
arg
.
bias
,
dtype
::
Float32
()},
{},
dst_layout
);
float
computations
=
0.0
;
if
(
io_pack_size
==
1
)
{
//! dst.nr_elems * IC * FH * FW * 2
//! dst.nr_elems * IC * FH * FW * 2
float
computations
=
dst_layout
.
total_nr_elems
()
*
arg
.
filter
[
1
]
*
computations
=
dst_layout
.
total_nr_elems
()
*
arg
.
filter
[
1
]
*
arg
.
filter
[
2
]
*
arg
.
filter
[
2
]
*
arg
.
filter
[
3
]
*
2.0
/
arg
.
filter
[
3
]
*
2.0
/
(
1024
*
1024
*
1024
)
*
1e3
;
(
1024
*
1024
*
1024
)
*
1e3
;
}
else
{
//! dst.nr_elems * IC/4 * FH * FW * 4 * 2
computations
=
dst_layout
.
total_nr_elems
()
*
arg
.
filter
[
1
]
*
arg
.
filter
[
2
]
*
arg
.
filter
[
3
]
*
arg
.
filter
[
4
]
*
2.0
/
(
1024
*
1024
*
1024
)
*
1e3
;
}
param
::
Convolution
conv_param
;
param
::
Convolution
conv_param
;
conv_param
.
pad_h
=
arg
.
param
.
pad_h
;
conv_param
.
pad_h
=
arg
.
param
.
pad_h
;
...
@@ -999,9 +1017,9 @@ void benchmark_winograd(
...
@@ -999,9 +1017,9 @@ void benchmark_winograd(
// usage of weight pre-processing for winograd benchmark
// usage of weight pre-processing for winograd benchmark
void
benchmark_winograd_weight_preprocess
(
void
benchmark_winograd_weight_preprocess
(
const
char
*
algo_name
,
megdnn
::
Handle
*
handle
,
size_t
kernel
,
const
char
*
algo_name
,
megdnn
::
Handle
*
handle
,
size_t
kernel
,
size_t
pack_size
,
size_t
pack_size
)
{
size_t
io_
pack_size
)
{
auto
&&
args
=
get_winograd_benchmark_args
(
kernel
,
pack_size
);
auto
&&
args
=
get_winograd_benchmark_args
(
kernel
,
pack_size
,
io_pack_size
);
using
namespace
conv_bias
;
using
namespace
conv_bias
;
constexpr
size_t
RUN
=
10
;
constexpr
size_t
RUN
=
10
;
...
@@ -1018,16 +1036,17 @@ void benchmark_winograd_weight_preprocess(
...
@@ -1018,16 +1036,17 @@ void benchmark_winograd_weight_preprocess(
opr
->
deduce_layout
(
opr
->
deduce_layout
(
{
arg
.
src
,
dtype
::
Float32
()},
{
arg
.
filter
,
dtype
::
Float32
()},
{
arg
.
src
,
dtype
::
Float32
()},
{
arg
.
filter
,
dtype
::
Float32
()},
{
arg
.
bias
,
dtype
::
Float32
()},
{},
dst_layout
);
{
arg
.
bias
,
dtype
::
Float32
()},
{},
dst_layout
);
float
computations
=
0.0
;
if
(
io_pack_size
==
1
)
{
//! dst.nr_elems * IC * FH * FW * 2
//! dst.nr_elems * IC * FH * FW * 2
float
computations
=
dst_layout
.
total_nr_elems
()
*
arg
.
filter
[
1
]
*
computations
=
dst_layout
.
total_nr_elems
()
*
arg
.
filter
[
1
]
*
arg
.
filter
[
2
]
*
arg
.
filter
[
2
]
*
arg
.
filter
[
3
]
*
2.0
/
arg
.
filter
[
3
]
*
2.0
/
(
1024
*
1024
*
1024
)
*
1e3
;
(
1024
*
1024
*
1024
)
*
1e3
;
}
else
{
//! dst.nr_elems * IC/4 * FH * FW * 4 * 2
param
::
Convolution
conv_param
;
computations
=
dst_layout
.
total_nr_elems
()
*
arg
.
filter
[
1
]
*
arg
.
filter
[
2
]
*
conv_param
.
pad_h
=
arg
.
param
.
pad_h
;
arg
.
filter
[
3
]
*
arg
.
filter
[
4
]
*
2.0
/
(
1024
*
1024
*
1024
)
*
conv_param
.
pad_w
=
arg
.
param
.
pad_w
;
1e3
;
conv_param
.
stride_h
=
arg
.
param
.
stride_h
;
}
conv_param
.
stride_w
=
arg
.
param
.
stride_w
;
benchmark_winograd
.
set_param
(
arg
.
param
);
benchmark_winograd
.
set_param
(
arg
.
param
);
auto
used_winograd
=
auto
used_winograd
=
...
@@ -1045,8 +1064,8 @@ void benchmark_winograd_weight_preprocess(
...
@@ -1045,8 +1064,8 @@ void benchmark_winograd_weight_preprocess(
void
benchmark_winograd_compare
(
void
benchmark_winograd_compare
(
const
char
*
algoA_name
,
const
char
*
algoB_name
,
megdnn
::
Handle
*
handle
,
const
char
*
algoA_name
,
const
char
*
algoB_name
,
megdnn
::
Handle
*
handle
,
size_t
kernel
,
size_t
pack_size
)
{
size_t
kernel
,
size_t
pack_size
,
size_t
io_pack_size
)
{
auto
&&
args
=
get_winograd_benchmark_args
(
kernel
,
pack_size
);
auto
&&
args
=
get_winograd_benchmark_args
(
kernel
,
pack_size
,
io_pack_size
);
using
namespace
conv_bias
;
using
namespace
conv_bias
;
constexpr
size_t
RUN
=
10
;
constexpr
size_t
RUN
=
10
;
...
@@ -1062,16 +1081,17 @@ void benchmark_winograd_compare(
...
@@ -1062,16 +1081,17 @@ void benchmark_winograd_compare(
opr
->
deduce_layout
(
opr
->
deduce_layout
(
{
arg
.
src
,
dtype
::
Float32
()},
{
arg
.
filter
,
dtype
::
Float32
()},
{
arg
.
src
,
dtype
::
Float32
()},
{
arg
.
filter
,
dtype
::
Float32
()},
{
arg
.
bias
,
dtype
::
Float32
()},
{},
dst_layout
);
{
arg
.
bias
,
dtype
::
Float32
()},
{},
dst_layout
);
float
computations
=
0.0
;
if
(
io_pack_size
==
1
)
{
//! dst.nr_elems * IC * FH * FW * 2
//! dst.nr_elems * IC * FH * FW * 2
float
computations
=
dst_layout
.
total_nr_elems
()
*
arg
.
filter
[
1
]
*
computations
=
dst_layout
.
total_nr_elems
()
*
arg
.
filter
[
1
]
*
arg
.
filter
[
2
]
*
arg
.
filter
[
2
]
*
arg
.
filter
[
3
]
*
2.0
/
arg
.
filter
[
3
]
*
2.0
/
(
1024
*
1024
*
1024
)
*
1e3
;
(
1024
*
1024
*
1024
)
*
1e3
;
}
else
{
//! dst.nr_elems * IC/4 * FH * FW * 4 * 2
param
::
Convolution
conv_param
;
computations
=
dst_layout
.
total_nr_elems
()
*
arg
.
filter
[
1
]
*
arg
.
filter
[
2
]
*
conv_param
.
pad_h
=
arg
.
param
.
pad_h
;
arg
.
filter
[
3
]
*
arg
.
filter
[
4
]
*
2.0
/
(
1024
*
1024
*
1024
)
*
conv_param
.
pad_w
=
arg
.
param
.
pad_w
;
1e3
;
conv_param
.
stride_h
=
arg
.
param
.
stride_h
;
}
conv_param
.
stride_w
=
arg
.
param
.
stride_w
;
benchmark_winograd
.
set_param
(
arg
.
param
);
benchmark_winograd
.
set_param
(
arg
.
param
);
auto
used_winograd1
=
auto
used_winograd1
=
...
...
dnn/test/common/conv_bias.h
浏览文件 @
217999b1
...
@@ -62,16 +62,16 @@ void check_conv_bias(
...
@@ -62,16 +62,16 @@ void check_conv_bias(
#if MEGDNN_WITH_BENCHMARK
#if MEGDNN_WITH_BENCHMARK
std
::
vector
<
conv_bias
::
TestArg
>
get_winograd_benchmark_args
(
std
::
vector
<
conv_bias
::
TestArg
>
get_winograd_benchmark_args
(
size_t
kernel
,
size_t
pack_size
=
1
);
size_t
kernel
,
size_t
pack_size
=
1
,
size_t
io_pack_size
=
1
);
void
benchmark_winograd
(
void
benchmark_winograd
(
const
char
*
algo_name
,
megdnn
::
Handle
*
handle
,
size_t
kernel
,
const
char
*
algo_name
,
megdnn
::
Handle
*
handle
,
size_t
kernel
,
size_t
pack_size
=
1
);
size_t
pack_size
=
1
,
size_t
io_pack_size
=
1
);
void
benchmark_winograd_weight_preprocess
(
void
benchmark_winograd_weight_preprocess
(
const
char
*
algo_name
,
megdnn
::
Handle
*
handle
,
size_t
kernel
,
const
char
*
algo_name
,
megdnn
::
Handle
*
handle
,
size_t
kernel
,
size_t
pack_size
=
1
);
size_t
pack_size
=
1
,
size_t
io_pack_size
=
1
);
void
benchmark_winograd_compare
(
void
benchmark_winograd_compare
(
const
char
*
algoA_name
,
const
char
*
algoB_name
,
megdnn
::
Handle
*
handle
,
const
char
*
algoA_name
,
const
char
*
algoB_name
,
megdnn
::
Handle
*
handle
,
size_t
kernel
,
size_t
pack_size
=
1
);
size_t
kernel
,
size_t
pack_size
=
1
,
size_t
io_pack_size
=
1
);
#endif // MEGDNN_WITH_BENCHMARK
#endif // MEGDNN_WITH_BENCHMARK
template
<
class
Checker
>
template
<
class
Checker
>
void
check_winograd
(
void
check_winograd
(
...
...
dnn/test/fallback/conv_bias.cpp
浏览文件 @
217999b1
...
@@ -597,6 +597,25 @@ TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_4_NCHW44) {
...
@@ -597,6 +597,25 @@ TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_4_NCHW44) {
param
::
ConvBias
::
Format
::
NCHW44
);
param
::
ConvBias
::
Format
::
NCHW44
);
}
}
TEST_F
(
FALLBACK_MULTI_THREADS
,
CONVBIAS_GI_WINOGRAD_F43_4_NCHW44
)
{
using
namespace
conv_bias
;
std
::
vector
<
TestArg
>
args
=
get_nchw44_conv_bias_args
({
3
},
QUAN_NLMODE
,
BR_AND_NO_BIASMODE
,
1
);
Checker
<
ConvBiasForward
>
checker
(
handle
());
check_winograd
(
"4:4:16"
,
checker
,
args
,
param
::
MatrixMul
::
Format
::
MK4
,
param
::
ConvBias
::
Format
::
NCHW44
);
}
TEST_F
(
FALLBACK_MULTI_THREADS
,
CONVBIAS_GI_WINOGRAD_F43_4_WEIGHT_PREPROCESS
)
{
using
namespace
conv_bias
;
std
::
vector
<
TestArg
>
args
=
get_winograd_mk_packed_args
();
Checker
<
ConvBiasForward
,
OprWeightPreprocessProxy
<
ConvBiasForward
>>
checker
(
handle
());
check_winograd
(
"4:4:16"
,
checker
,
args
,
param
::
MatrixMul
::
Format
::
MK4
);
}
TEST_F
(
FALLBACK_MULTI_THREADS
,
CONVBIAS_GI_WINOGRAD_F54
)
{
TEST_F
(
FALLBACK_MULTI_THREADS
,
CONVBIAS_GI_WINOGRAD_F54
)
{
using
namespace
conv_bias
;
using
namespace
conv_bias
;
std
::
vector
<
TestArg
>
args
=
get_winograd_args
(
4
);
std
::
vector
<
TestArg
>
args
=
get_winograd_args
(
4
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录