Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
ad3c9315
MegEngine
项目概览
MegEngine 天元
/
MegEngine
10 个月 前同步成功
通知
392
Star
4702
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
ad3c9315
编写于
5月 21, 2020
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(dnn/arm): add arm nchw44 fp32 pooling
GitOrigin-RevId: 6a26dad0a1dd2963b1b8f4d6c746dfcb58d911b5
上级
d19a6379
变更
11
隐藏空白更改
内联
并排
Showing
11 changed file
with
733 addition
and
187 deletion
+733
-187
dnn/src/arm_common/conv_bias/intrinsic_helper.h
dnn/src/arm_common/conv_bias/intrinsic_helper.h
+5
-181
dnn/src/arm_common/intrinsic_helper.h
dnn/src/arm_common/intrinsic_helper.h
+126
-0
dnn/src/arm_common/neon_struct.h
dnn/src/arm_common/neon_struct.h
+5
-3
dnn/src/arm_common/pooling/algo.h
dnn/src/arm_common/pooling/algo.h
+7
-1
dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp
dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp
+126
-0
dnn/src/arm_common/pooling/kern_fp32_pooling_nchw44.h
dnn/src/arm_common/pooling/kern_fp32_pooling_nchw44.h
+308
-0
dnn/src/arm_common/pooling/opr_impl.cpp
dnn/src/arm_common/pooling/opr_impl.cpp
+2
-0
dnn/src/arm_common/pooling/opr_impl.h
dnn/src/arm_common/pooling/opr_impl.h
+1
-1
dnn/src/common/unroll_macro.h
dnn/src/common/unroll_macro.h
+27
-1
dnn/test/arm_common/pooling.cpp
dnn/test/arm_common/pooling.cpp
+67
-0
dnn/test/arm_common/pooling_multi_thread.cpp
dnn/test/arm_common/pooling_multi_thread.cpp
+59
-0
未找到文件。
dnn/src/arm_common/conv_bias/intrinsic_helper.h
浏览文件 @
ad3c9315
#pragma once
/**
* \file dnn/src/arm_common/conv_bias/intrinsic_helper.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
...
...
@@ -10,7 +9,9 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/arm_common/conv_bias/neon_struct.h"
#pragma once
#include "src/arm_common/intrinsic_helper.h"
#include "src/arm_common/neon_struct.h"
#include "src/arm_common/simd_macro/marm_neon.h"
#include "src/common/unroll_macro.h"
#include "src/fallback/conv_bias/common.h"
...
...
@@ -689,185 +690,8 @@ inline void init_ocx_ow4(T& c, const int32_t* bias_ptr, int oc_step) {
InitOcxOw4
<
c_dim
,
bias_mode
,
T
>::
impl
(
c
,
bias_ptr
,
oc_step
);
}
///////////////////////////////////////
template
<
int
weight_number
,
int
base_offset
,
int
ptr_step
,
int
oc_block
,
typename
Func
,
typename
T
,
typename
T2
,
typename
...
XT
>
struct
LoadHelper
{
static
void
impl
(
T
&
weight
,
T2
ptr
,
int
oc_offset
,
XT
...
args
);
};
#define WEIGHT_CB(step) \
src[step] = Func::impl(ptr + base_offset + step * ptr_step, args...);
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
,
typename
...
XT
>
struct
LoadHelper
<
1
,
base_offset
,
ptr_step
,
0
,
Func
,
T
,
T2
,
XT
...
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
,
XT
...
args
)
{
UNROLL_CALL_RAW
(
1
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
,
typename
...
XT
>
struct
LoadHelper
<
2
,
base_offset
,
ptr_step
,
0
,
Func
,
T
,
T2
,
XT
...
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
,
XT
...
args
)
{
UNROLL_CALL_RAW
(
2
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
,
typename
...
XT
>
struct
LoadHelper
<
3
,
base_offset
,
ptr_step
,
0
,
Func
,
T
,
T2
,
XT
...
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
,
XT
...
args
)
{
UNROLL_CALL_RAW
(
3
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
,
typename
...
XT
>
struct
LoadHelper
<
4
,
base_offset
,
ptr_step
,
0
,
Func
,
T
,
T2
,
XT
...
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
,
XT
...
args
)
{
UNROLL_CALL_RAW
(
4
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
,
typename
...
XT
>
struct
LoadHelper
<
5
,
base_offset
,
ptr_step
,
0
,
Func
,
T
,
T2
,
XT
...
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
,
XT
...
args
)
{
UNROLL_CALL_RAW
(
5
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
,
typename
...
XT
>
struct
LoadHelper
<
6
,
base_offset
,
ptr_step
,
0
,
Func
,
T
,
T2
,
XT
...
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
,
XT
...
args
)
{
UNROLL_CALL_RAW
(
6
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
,
typename
...
XT
>
struct
LoadHelper
<
7
,
base_offset
,
ptr_step
,
0
,
Func
,
T
,
T2
,
XT
...
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
,
XT
...
args
)
{
UNROLL_CALL_RAW
(
7
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
,
typename
...
XT
>
struct
LoadHelper
<
8
,
base_offset
,
ptr_step
,
0
,
Func
,
T
,
T2
,
XT
...
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
,
XT
...
args
)
{
UNROLL_CALL_RAW
(
8
,
WEIGHT_CB
);
}
};
#undef WEIGHT_CB
#define WEIGHT_CB(step) \
src[0][step] = Func::impl(ptr + base_offset + step * ptr_step);
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
>
struct
LoadHelper
<
1
,
base_offset
,
ptr_step
,
1
,
Func
,
T
,
T2
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
)
{
UNROLL_CALL_RAW
(
1
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
>
struct
LoadHelper
<
2
,
base_offset
,
ptr_step
,
1
,
Func
,
T
,
T2
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
)
{
UNROLL_CALL_RAW
(
2
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
>
struct
LoadHelper
<
3
,
base_offset
,
ptr_step
,
1
,
Func
,
T
,
T2
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
)
{
UNROLL_CALL_RAW
(
3
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
>
struct
LoadHelper
<
4
,
base_offset
,
ptr_step
,
1
,
Func
,
T
,
T2
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
)
{
UNROLL_CALL_RAW
(
4
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
>
struct
LoadHelper
<
5
,
base_offset
,
ptr_step
,
1
,
Func
,
T
,
T2
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
)
{
UNROLL_CALL_RAW
(
5
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
>
struct
LoadHelper
<
6
,
base_offset
,
ptr_step
,
1
,
Func
,
T
,
T2
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
)
{
UNROLL_CALL_RAW
(
6
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
>
struct
LoadHelper
<
7
,
base_offset
,
ptr_step
,
1
,
Func
,
T
,
T2
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
)
{
UNROLL_CALL_RAW
(
7
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
>
struct
LoadHelper
<
8
,
base_offset
,
ptr_step
,
1
,
Func
,
T
,
T2
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
)
{
UNROLL_CALL_RAW
(
8
,
WEIGHT_CB
);
}
};
#undef WEIGHT_CB
#define WEIGHT_CB(step) \
src[0][step] = Func::impl(ptr + base_offset + step * ptr_step); \
src[1][step] = Func::impl(ptr + base_offset + step * ptr_step + oc_offset);
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
>
struct
LoadHelper
<
1
,
base_offset
,
ptr_step
,
2
,
Func
,
T
,
T2
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
oc_offset
)
{
UNROLL_CALL_RAW
(
1
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
>
struct
LoadHelper
<
2
,
base_offset
,
ptr_step
,
2
,
Func
,
T
,
T2
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
oc_offset
)
{
UNROLL_CALL_RAW
(
2
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
>
struct
LoadHelper
<
3
,
base_offset
,
ptr_step
,
2
,
Func
,
T
,
T2
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
oc_offset
)
{
UNROLL_CALL_RAW
(
3
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
>
struct
LoadHelper
<
4
,
base_offset
,
ptr_step
,
2
,
Func
,
T
,
T2
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
oc_offset
)
{
UNROLL_CALL_RAW
(
4
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
>
struct
LoadHelper
<
5
,
base_offset
,
ptr_step
,
2
,
Func
,
T
,
T2
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
oc_offset
)
{
UNROLL_CALL_RAW
(
5
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
>
struct
LoadHelper
<
6
,
base_offset
,
ptr_step
,
2
,
Func
,
T
,
T2
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
oc_offset
)
{
UNROLL_CALL_RAW
(
6
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
>
struct
LoadHelper
<
7
,
base_offset
,
ptr_step
,
2
,
Func
,
T
,
T2
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
oc_offset
)
{
UNROLL_CALL_RAW
(
7
,
WEIGHT_CB
);
}
};
template
<
int
base_offset
,
int
ptr_step
,
typename
Func
,
typename
T
,
typename
T2
>
struct
LoadHelper
<
8
,
base_offset
,
ptr_step
,
2
,
Func
,
T
,
T2
>
{
static
void
impl
(
T
&
src
,
T2
ptr
,
int
oc_offset
)
{
UNROLL_CALL_RAW
(
8
,
WEIGHT_CB
);
}
};
#undef WEIGHT_CB
template
<
int
weight_number
,
int
base_offset
,
int
ptr_step
,
int
c_dim
,
typename
Func
,
typename
T
,
typename
T2
>
inline
void
load_helper
(
T
&
weight
,
T2
ptr
,
int
oc_offset
)
{
LoadHelper
<
weight_number
,
base_offset
,
ptr_step
,
c_dim
,
Func
,
T
,
T2
>::
impl
(
weight
,
ptr
,
oc_offset
);
}
template
<
int
weight_number
,
int
base_offset
,
int
ptr_step
,
int
c_dim
,
typename
Func
,
typename
T
,
typename
T2
,
typename
...
XT
>
inline
void
load_helper_x
(
T
&
weight
,
T2
ptr
,
int
oc_offset
,
XT
...
args
)
{
LoadHelper
<
weight_number
,
base_offset
,
ptr_step
,
c_dim
,
Func
,
T
,
T2
,
XT
...
>::
impl
(
weight
,
ptr
,
oc_offset
,
args
...);
}
}
// namespace
}
// namespace megdnn
// vim: syntax=cpp.doxygen
dnn/src/arm_common/intrinsic_helper.h
0 → 100644
浏览文件 @
ad3c9315
/**
* \file dnn/src/arm_common/intrinsic_helper.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "src/arm_common/neon_struct.h"
#include "src/arm_common/simd_macro/marm_neon.h"
#include "src/common/unroll_macro.h"
namespace
megdnn
{
namespace
{
template
<
int
weight_number
,
int
base_offset
,
int
ptr_step
,
int
oc_block
,
typename
Func
,
typename
T
,
typename
T2
,
typename
...
XT
>
struct
LoadHelper
{
static
void
impl
(
T
&
weight
,
T2
ptr
,
int
oc_offset
,
XT
...
args
);
};
#define WEIGHT_CB(step) \
src[step] = Func::impl(ptr + base_offset + step * ptr_step, args...);
#define LOAD_HELPER(step) \
template <int base_offset, int ptr_step, typename Func, typename T, \
typename T2, typename... XT> \
struct LoadHelper<step, base_offset, ptr_step, 0, Func, T, T2, XT...> { \
static void impl(T& src, T2 ptr, int, XT... args) { \
UNROLL_CALL_RAW(step, WEIGHT_CB); \
} \
}
LOAD_HELPER
(
1
);
LOAD_HELPER
(
2
);
LOAD_HELPER
(
3
);
LOAD_HELPER
(
4
);
LOAD_HELPER
(
5
);
LOAD_HELPER
(
6
);
LOAD_HELPER
(
7
);
LOAD_HELPER
(
8
);
LOAD_HELPER
(
9
);
LOAD_HELPER
(
10
);
LOAD_HELPER
(
11
);
LOAD_HELPER
(
12
);
LOAD_HELPER
(
13
);
LOAD_HELPER
(
14
);
LOAD_HELPER
(
15
);
LOAD_HELPER
(
16
);
#undef LOAD_HELPER
#undef WEIGHT_CB
///////////////////////////c_dim = 1/////////////////////////
#define WEIGHT_CB(step) \
src[0][step] = Func::impl(ptr + base_offset + step * ptr_step);
#define LOAD_HELPER(step) \
template <int base_offset, int ptr_step, typename Func, typename T, \
typename T2> \
struct LoadHelper<step, base_offset, ptr_step, 1, Func, T, T2> { \
static void impl(T& src, T2 ptr, int) { \
UNROLL_CALL_RAW(step, WEIGHT_CB); \
} \
}
LOAD_HELPER
(
1
);
LOAD_HELPER
(
2
);
LOAD_HELPER
(
3
);
LOAD_HELPER
(
4
);
LOAD_HELPER
(
5
);
LOAD_HELPER
(
6
);
LOAD_HELPER
(
7
);
LOAD_HELPER
(
8
);
LOAD_HELPER
(
9
);
#undef LOAD_HELPER
#undef WEIGHT_CB
/////////////////////////c_dim = 2///////////////////////////////
#define WEIGHT_CB(step) \
src[0][step] = Func::impl(ptr + base_offset + step * ptr_step); \
src[1][step] = Func::impl(ptr + base_offset + step * ptr_step + oc_offset);
#define LOAD_HELPER(step) \
template <int base_offset, int ptr_step, typename Func, typename T, \
typename T2> \
struct LoadHelper<step, base_offset, ptr_step, 2, Func, T, T2> { \
static void impl(T& src, T2 ptr, int oc_offset) { \
UNROLL_CALL_RAW(step, WEIGHT_CB); \
} \
}
LOAD_HELPER
(
1
);
LOAD_HELPER
(
2
);
LOAD_HELPER
(
3
);
LOAD_HELPER
(
4
);
LOAD_HELPER
(
5
);
LOAD_HELPER
(
6
);
LOAD_HELPER
(
7
);
LOAD_HELPER
(
8
);
#undef LOAD_HELPER
#undef WEIGHT_CB
template
<
int
weight_number
,
int
base_offset
,
int
ptr_step
,
int
c_dim
,
typename
Func
,
typename
T
,
typename
T2
>
inline
void
load_helper
(
T
&
weight
,
T2
ptr
,
int
oc_offset
)
{
LoadHelper
<
weight_number
,
base_offset
,
ptr_step
,
c_dim
,
Func
,
T
,
T2
>::
impl
(
weight
,
ptr
,
oc_offset
);
}
template
<
int
weight_number
,
int
base_offset
,
int
ptr_step
,
int
c_dim
,
typename
Func
,
typename
T
,
typename
T2
,
typename
...
XT
>
inline
void
load_helper_x
(
T
&
weight
,
T2
ptr
,
int
oc_offset
,
XT
...
args
)
{
LoadHelper
<
weight_number
,
base_offset
,
ptr_step
,
c_dim
,
Func
,
T
,
T2
,
XT
...
>::
impl
(
weight
,
ptr
,
oc_offset
,
args
...);
}
}
// namespace
}
// namespace megdnn
// vim: syntax=cpp.doxygen
\ No newline at end of file
dnn/src/arm_common/
conv_bias/
neon_struct.h
→
dnn/src/arm_common/neon_struct.h
浏览文件 @
ad3c9315
#pragma once
/**
* \file dnn/src/arm_common/
conv_bias/
neon_struct.h
* \file dnn/src/arm_common/neon_struct.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
...
...
@@ -10,6 +9,7 @@
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "src/arm_common/simd_macro/marm_neon.h"
namespace
megdnn
{
namespace
{
...
...
@@ -62,4 +62,6 @@ struct Vfmaq_laneq_f32 {
};
}
// namespace
}
// namespace megdnn
\ No newline at end of file
}
// namespace megdnn
// vim: syntax=cpp.doxygen
\ No newline at end of file
dnn/src/arm_common/pooling/algo.h
浏览文件 @
ad3c9315
...
...
@@ -114,7 +114,13 @@ public:
bool
usable
(
const
PoolingKernSizeParam
&
param
)
const
override
;
void
exec
(
const
PoolingKernParam
&
param
)
const
override
;
};
class
PoolingImpl
::
AlgoFp32ModexStridexNCHW44
final
:
public
AlgoBase
{
public:
bool
is_reproducible
()
const
override
{
return
true
;
}
const
char
*
name
()
const
override
{
return
"ARM_POOLING_FP32_MODEX_STRIDEX_NCHW44"
;
}
bool
usable
(
const
PoolingKernSizeParam
&
param
)
const
override
;
void
exec
(
const
PoolingKernParam
&
param
)
const
override
;
};
WorkspaceBundle
get_bundle
(
const
PoolingImpl
::
PoolingKernSizeParam
&
param
);
WorkspaceBundle
get_bundle_nchw44
(
...
...
dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp
0 → 100644
浏览文件 @
ad3c9315
/**
* \file dnn/src/arm_common/pooling/algo_fp32_pooling_nchw44.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megdnn/opr_param_defs.h"
#include "src/arm_common/pooling/algo.h"
#include "src/arm_common/pooling/kern_fp32_pooling_nchw44.h"
#include "midout.h"
MIDOUT_DECL
(
megdnn_arm_common_fp32_pooling_nchw44
)
namespace
megdnn
{
namespace
arm_common
{
bool
PoolingImpl
::
AlgoFp32ModexStridexNCHW44
::
usable
(
const
PoolingKernSizeParam
&
param
)
const
{
uint32_t
sh
=
param
.
stride
[
0
];
uint32_t
sw
=
param
.
stride
[
1
];
uint32_t
fh
=
param
.
filter
[
0
];
uint32_t
fw
=
param
.
filter
[
1
];
bool
avaible
=
param
.
src_type
.
enumv
()
==
DTypeEnum
::
Float32
&&
param
.
format
==
Param
::
Format
::
NCHW44
&&
(
param
.
mode
==
Mode
::
MAX
||
param
.
mode
==
Mode
::
AVERAGE
)
&&
fh
==
fw
&&
sh
==
sw
&&
(
fh
==
2
||
fh
==
3
||
fh
==
4
||
fh
==
5
)
&&
(
sh
==
1
||
sh
==
2
);
return
avaible
;
}
void
PoolingImpl
::
AlgoFp32ModexStridexNCHW44
::
exec
(
const
PoolingKernParam
&
param
)
const
{
int
ih
=
param
.
isz
[
0
];
int
iw
=
param
.
isz
[
1
];
int
oh
=
param
.
osz
[
0
];
int
ow
=
param
.
osz
[
1
];
int
n
=
param
.
n
;
int
ic
=
param
.
ic
;
int
ph
=
param
.
padding
[
0
];
int
pw
=
param
.
padding
[
1
];
int
sh
=
param
.
stride
[
0
];
int
fh
=
param
.
filter
[
0
];
void
*
src_ptr
=
param
.
src_ptr
;
void
*
dst_ptr
=
param
.
dst_ptr
;
#define DISPATCH_FUNC(filter, stride, mode) \
MIDOUT_BEGIN(megdnn_arm_common_fp32_pooling_nchw44, midout_iv(0), \
midout_iv(#filter #stride #mode##_hash)) { \
auto run = [ih, iw, oh, ow, ph, pw, src_ptr, dst_ptr](size_t index, \
size_t) { \
const int c_idx = index; \
pooling_fp32_nchw44<filter, stride, mode>( \
static_cast<const float*>(src_ptr) + c_idx * ih * iw * 4, \
static_cast<float*>(dst_ptr) + c_idx * oh * ow * 4, ih, \
iw, oh, ow, ph, pw); \
}; \
MEGDNN_DISPATCH_MULTI_THREAD_CPU_KERN( \
static_cast<::megdnn::naive::HandleImpl*>(param.handle), \
n* ic, run); \
} \
MIDOUT_END();
#define DISPATCH_MODE(filter, stride) \
switch (param.mode) { \
case PoolingBase::Mode::MAX: \
DISPATCH_FUNC(filter, stride, PoolingBase::Mode::MAX); \
break; \
case PoolingBase::Mode::AVERAGE: \
DISPATCH_FUNC(filter, stride, PoolingBase::Mode::AVERAGE); \
break; \
default: \
megdnn_assert(0, "invalid mode %u", \
static_cast<uint32_t>(param.mode)); \
}
#define DISPATCH_STRIDE(filter) \
switch (sh) { \
case 1: \
DISPATCH_MODE(filter, 1); \
break; \
case 2: \
DISPATCH_MODE(filter, 2); \
break; \
default: \
megdnn_assert(0, "invalid stride %d", sh); \
}
#define DISPATCH_FILTER() \
switch (fh) { \
case 2: \
DISPATCH_STRIDE(2); \
break; \
case 3: \
DISPATCH_STRIDE(3); \
break; \
case 4: \
DISPATCH_STRIDE(4); \
break; \
case 5: \
DISPATCH_STRIDE(5); \
break; \
default: \
megdnn_assert(0, "invalid filter %d", fh); \
}
DISPATCH_FILTER
()
#undef DISPATCH_FILTER
#undef DISPATCH_STRIDE
#undef DISPATCH_MODE
#undef DISPATCH_FUNC
}
}
// namespace arm_common
}
// namespace megdnn
// vim: syntax=cpp.doxygen
\ No newline at end of file
dnn/src/arm_common/pooling/kern_fp32_pooling_nchw44.h
0 → 100644
浏览文件 @
ad3c9315
/**
* \file dnn/src/arm_common/pooling/kern_fp32_pooling_nchw44.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include <limits>
#include "megdnn/opr_param_defs.h"
#include "src/arm_common/intrinsic_helper.h"
#include "src/arm_common/neon_struct.h"
#include "src/arm_common/simd_macro/marm_neon.h"
#include "src/common/unroll_macro.h"
namespace
megdnn
{
namespace
arm_common
{
namespace
{
template
<
int
filter
,
int
stride
,
int
ow_step
,
PoolingBase
::
Mode
mode
,
typename
T1
,
typename
T2
>
struct
CalXsXNchw44
{
static
void
impl
(
T1
result
,
T2
src
);
};
template
<
int
filter
,
int
stride
,
int
ow_step
,
PoolingBase
::
Mode
mode
,
typename
T1
,
typename
T2
>
void
calculate_xsx_nchw44
(
T1
result
,
T2
src
)
{
CalXsXNchw44
<
filter
,
stride
,
ow_step
,
mode
,
T1
,
T2
>::
impl
(
result
,
src
);
};
#define CALCULATE_MAX_CB(step) \
result[0] = vmaxq_f32(result[0], src[0 * stride + step]); \
result[1] = vmaxq_f32(result[1], src[1 * stride + step]); \
result[2] = vmaxq_f32(result[2], src[2 * stride + step]); \
result[3] = vmaxq_f32(result[3], src[3 * stride + step]);
#define CALCULATE_AVG_CB(step) \
result[0] = vaddq_f32(result[0], src[0 * stride + step]); \
result[1] = vaddq_f32(result[1], src[1 * stride + step]); \
result[2] = vaddq_f32(result[2], src[2 * stride + step]); \
result[3] = vaddq_f32(result[3], src[3 * stride + step]);
#define INSTANCE_CAL(filter) \
template <int stride, typename T1, typename T2> \
struct CalXsXNchw44<filter, stride, 4, PoolingBase::Mode::MAX, T1, T2> { \
static void impl(T1 result, T2 src) { \
UNROLL_CALL_RAW(filter, CALCULATE_MAX_CB); \
} \
}; \
template <int stride, typename T1, typename T2> \
struct CalXsXNchw44<filter, stride, 4, PoolingBase::Mode::AVERAGE, T1, \
T2> { \
static void impl(T1 result, T2 src) { \
UNROLL_CALL_RAW(filter, CALCULATE_AVG_CB); \
} \
};
INSTANCE_CAL
(
2
)
INSTANCE_CAL
(
3
)
INSTANCE_CAL
(
4
)
INSTANCE_CAL
(
5
)
#undef INSTANCE_CAL
#undef CALCULATE_AVG_CB
#undef CALCULATE_MAX_CB
template
<
int
filter
,
int
stride
,
int
ow_step
,
PoolingBase
::
Mode
mode
>
struct
KerPoolingFilterXStrideXNchw44
{
static
void
impl
(
const
float32_t
*
src_ptr
,
float32_t
*
dst_ptr
,
size_t
iw
);
};
template
<
int
filter
,
int
stride
,
int
ow_step
>
struct
KerPoolingFilterXStrideXNchw44
<
filter
,
stride
,
ow_step
,
PoolingBase
::
Mode
::
MAX
>
{
static
void
impl
(
const
float32_t
*
src_ptr
,
float32_t
*
dst_ptr
,
size_t
iw
)
{
constexpr
int
src_reg_size
=
ow_step
*
stride
+
filter
-
stride
;
constexpr
int
packed_ic
=
4
;
constexpr
int
simd_len
=
4
;
constexpr
float
default_float
=
std
::
numeric_limits
<
float
>::
lowest
();
float32x4_t
result
[
ow_step
];
float32x4_t
src
[
src_reg_size
];
result
[
0
]
=
vdupq_n_f32
(
default_float
);
result
[
1
]
=
vdupq_n_f32
(
default_float
);
result
[
2
]
=
vdupq_n_f32
(
default_float
);
result
[
3
]
=
vdupq_n_f32
(
default_float
);
for
(
int
fh_idx
=
0
;
fh_idx
<
filter
;
++
fh_idx
)
{
load_helper
<
src_reg_size
,
0
,
simd_len
,
0
,
Vld1q_f32
>
(
src
,
src_ptr
+
fh_idx
*
iw
*
packed_ic
,
0
);
calculate_xsx_nchw44
<
filter
,
stride
,
ow_step
,
PoolingBase
::
Mode
::
MAX
>
(
result
,
src
);
}
vst1q_f32
(
dst_ptr
+
0
*
packed_ic
,
result
[
0
]);
vst1q_f32
(
dst_ptr
+
1
*
packed_ic
,
result
[
1
]);
vst1q_f32
(
dst_ptr
+
2
*
packed_ic
,
result
[
2
]);
vst1q_f32
(
dst_ptr
+
3
*
packed_ic
,
result
[
3
]);
}
};
template
<
int
filter
,
int
stride
,
int
ow_step
>
struct
KerPoolingFilterXStrideXNchw44
<
filter
,
stride
,
ow_step
,
PoolingBase
::
Mode
::
AVERAGE
>
{
static
void
impl
(
const
float32_t
*
src_ptr
,
float32_t
*
dst_ptr
,
size_t
iw
)
{
constexpr
int
src_reg_size
=
ow_step
*
stride
+
filter
-
stride
;
constexpr
int
packed_ic
=
4
;
constexpr
int
simd_len
=
4
;
constexpr
float
default_float
=
0
;
constexpr
float
div_filter_size
=
1.
f
/
(
filter
*
filter
);
const
float32x4_t
div_filter_size_vec
=
vdupq_n_f32
(
div_filter_size
);
float32x4_t
result
[
ow_step
];
float32x4_t
src
[
src_reg_size
];
result
[
0
]
=
vdupq_n_f32
(
default_float
);
result
[
1
]
=
vdupq_n_f32
(
default_float
);
result
[
2
]
=
vdupq_n_f32
(
default_float
);
result
[
3
]
=
vdupq_n_f32
(
default_float
);
for
(
int
fh_idx
=
0
;
fh_idx
<
filter
;
++
fh_idx
)
{
load_helper
<
src_reg_size
,
0
,
simd_len
,
0
,
Vld1q_f32
>
(
src
,
src_ptr
+
fh_idx
*
iw
*
packed_ic
,
0
);
calculate_xsx_nchw44
<
filter
,
stride
,
ow_step
,
PoolingBase
::
Mode
::
AVERAGE
>
(
result
,
src
);
}
result
[
0
]
=
vmulq_f32
(
result
[
0
],
div_filter_size_vec
);
result
[
1
]
=
vmulq_f32
(
result
[
1
],
div_filter_size_vec
);
result
[
2
]
=
vmulq_f32
(
result
[
2
],
div_filter_size_vec
);
result
[
3
]
=
vmulq_f32
(
result
[
3
],
div_filter_size_vec
);
vst1q_f32
(
dst_ptr
+
0
*
packed_ic
,
result
[
0
]);
vst1q_f32
(
dst_ptr
+
1
*
packed_ic
,
result
[
1
]);
vst1q_f32
(
dst_ptr
+
2
*
packed_ic
,
result
[
2
]);
vst1q_f32
(
dst_ptr
+
3
*
packed_ic
,
result
[
3
]);
}
};
template
<
PoolingBase
::
Mode
mode
>
void
ker_pooling_nchw44_remain_pad
(
const
float32_t
*
src_ptr
,
float32_t
*
dst_ptr
,
const
int
iw
,
const
int
pad_top
,
const
int
pad_bottom
,
const
int
pad_left
,
const
int
pad_right
,
const
int
filter
);
template
<
>
void
ker_pooling_nchw44_remain_pad
<
PoolingBase
::
Mode
::
MAX
>
(
const
float32_t
*
src_ptr
,
float32_t
*
dst_ptr
,
const
int
iw
,
const
int
pad_top
,
const
int
pad_bottom
,
const
int
pad_left
,
const
int
pad_right
,
const
int
filter
)
{
constexpr
int
ic_step
=
4
;
const
int
ih_end
=
filter
-
pad_bottom
;
const
int
iw_end
=
filter
-
pad_right
;
float32x4_t
result
=
vdupq_n_f32
(
std
::
numeric_limits
<
float
>::
lowest
());
for
(
int
ih_idx
=
pad_top
;
ih_idx
<
ih_end
;
++
ih_idx
)
{
for
(
int
iw_idx
=
pad_left
;
iw_idx
<
iw_end
;
++
iw_idx
)
{
float32x4_t
src
=
vld1q_f32
(
src_ptr
+
(
iw_idx
-
pad_left
)
*
ic_step
);
result
=
vmaxq_f32
(
result
,
src
);
}
src_ptr
+=
iw
*
ic_step
;
}
vst1q_f32
(
dst_ptr
,
result
);
}
template
<
>
void
ker_pooling_nchw44_remain_pad
<
PoolingBase
::
Mode
::
AVERAGE
>
(
const
float32_t
*
src_ptr
,
float32_t
*
dst_ptr
,
const
int
iw
,
const
int
pad_top
,
const
int
pad_bottom
,
const
int
pad_left
,
const
int
pad_right
,
const
int
filter
)
{
constexpr
int
ic_step
=
4
;
const
int
ih_end
=
filter
-
pad_bottom
;
const
int
iw_end
=
filter
-
pad_right
;
const
float
div_filter_size
=
1.
f
/
(
filter
*
filter
);
const
float32x4_t
div_filter_size_vec
=
vdupq_n_f32
(
div_filter_size
);
float32x4_t
result
=
vdupq_n_f32
(
0.
f
);
for
(
int
ih_idx
=
pad_top
;
ih_idx
<
ih_end
;
++
ih_idx
)
{
for
(
int
iw_idx
=
pad_left
;
iw_idx
<
iw_end
;
++
iw_idx
)
{
float32x4_t
src
=
vld1q_f32
(
src_ptr
+
(
iw_idx
-
pad_left
)
*
ic_step
);
result
=
vaddq_f32
(
result
,
src
);
}
src_ptr
+=
iw
*
ic_step
;
}
result
=
vmulq_f32
(
result
,
div_filter_size_vec
);
vst1q_f32
(
dst_ptr
,
result
);
}
template
<
PoolingBase
::
Mode
mode
>
static
inline
void
kern_pooling_with_pad_nchw44
(
const
float32_t
*
src
,
float32_t
*
dst
,
const
int
filter
,
const
int
ow_start
,
const
int
ow_end
,
const
int
iw
,
const
int
ow
,
const
int
stride_w
,
const
int
pw
,
const
int
real_ih_idx
,
const
int
oh_idx
,
const
int
pad_top
,
const
int
pad_bottom
)
{
constexpr
int
ic_step
=
4
;
constexpr
int
oc_step
=
4
;
for
(
int
ow_idx
=
ow_start
;
ow_idx
<
ow_end
;
++
ow_idx
)
{
const
int
iw_idx
=
ow_idx
*
stride_w
;
const
int
real_iw_idx
=
std
::
max
(
iw_idx
-
pw
,
0
);
const
int
pad_left
=
std
::
max
(
0
,
pw
-
iw_idx
);
const
int
pad_right
=
std
::
max
(
0
,
iw_idx
-
pw
+
filter
-
iw
);
const
int
src_offset
=
(
real_ih_idx
*
iw
+
real_iw_idx
)
*
ic_step
;
const
int
dst_offset
=
(
oh_idx
*
ow
+
ow_idx
)
*
oc_step
;
ker_pooling_nchw44_remain_pad
<
mode
>
(
src
+
src_offset
,
dst
+
dst_offset
,
iw
,
pad_top
,
pad_bottom
,
pad_left
,
pad_right
,
filter
);
}
}
template
<
int
filter
,
int
stride
,
PoolingBase
::
Mode
mode
>
static
inline
void
pooling_fp32_nchw44_pad
(
const
float32_t
*
src
,
float32_t
*
dst
,
int
ih
,
int
iw
,
int
oh
,
int
ow
,
int
ph
,
int
pw
)
{
constexpr
int
stride_h
=
stride
;
constexpr
int
stride_w
=
stride
;
constexpr
int
ic_step
=
4
;
constexpr
int
oc_step
=
4
;
constexpr
int
ow_step
=
4
;
const
int
ow_pad_left_end
=
div_ceil
(
pw
,
stride_w
);
const
int
ow_pad_right_end
=
(
iw
-
filter
+
pw
-
1
)
/
stride_w
;
const
int
ow_pad_right_step_end
=
(
ow_pad_right_end
-
ow_pad_left_end
)
/
ow_step
*
ow_step
+
ow_pad_left_end
;
rep
(
oh_idx
,
oh
)
{
const
int
ih_idx
=
oh_idx
*
stride_h
;
const
int
real_ih_idx
=
std
::
max
(
ih_idx
-
ph
,
0
);
const
int
pad_top
=
std
::
max
(
0
,
ph
-
ih_idx
);
const
int
pad_bottom
=
std
::
max
(
0
,
ih_idx
-
ph
+
filter
-
ih
);
if
(
pad_top
>
0
||
pad_bottom
>
0
)
{
kern_pooling_with_pad_nchw44
<
mode
>
(
src
,
dst
,
filter
,
0
,
ow
,
iw
,
ow
,
stride_w
,
pw
,
real_ih_idx
,
oh_idx
,
pad_top
,
pad_bottom
);
}
else
{
kern_pooling_with_pad_nchw44
<
mode
>
(
src
,
dst
,
filter
,
0
,
ow_pad_left_end
,
iw
,
ow
,
stride_w
,
pw
,
real_ih_idx
,
oh_idx
,
pad_top
,
pad_bottom
);
for
(
int
ow_idx
=
ow_pad_left_end
;
ow_idx
<
ow_pad_right_step_end
;
ow_idx
+=
ow_step
)
{
const
int
iw_idx
=
ow_idx
*
stride_w
;
const
int
real_iw_idx
=
std
::
max
(
iw_idx
-
pw
,
0
);
const
int
src_offset
=
(
real_ih_idx
*
iw
+
real_iw_idx
)
*
ic_step
;
const
int
dst_offset
=
(
oh_idx
*
ow
+
ow_idx
)
*
oc_step
;
KerPoolingFilterXStrideXNchw44
<
filter
,
stride
,
ow_step
,
mode
>::
impl
(
src
+
src_offset
,
dst
+
dst_offset
,
iw
);
}
kern_pooling_with_pad_nchw44
<
mode
>
(
src
,
dst
,
filter
,
ow_pad_right_step_end
,
ow
,
iw
,
ow
,
stride_w
,
pw
,
real_ih_idx
,
oh_idx
,
pad_top
,
pad_bottom
);
}
}
}
template
<
int
filter
,
int
stride
,
PoolingBase
::
Mode
mode
>
static
inline
void
pooling_fp32_nchw44_no_pad
(
const
float32_t
*
src
,
float32_t
*
dst
,
int
,
int
iw
,
int
oh
,
int
ow
)
{
constexpr
int
stride_h
=
stride
;
constexpr
int
stride_w
=
stride
;
constexpr
int
ic_step
=
4
;
constexpr
int
oc_step
=
4
;
constexpr
int
ow_step
=
4
;
const
int
ow_end
=
ow
/
ow_step
*
ow_step
;
const
int
ow_remain
=
ow
-
ow_end
;
rep
(
oh_idx
,
oh
)
{
const
int
ih_idx
=
oh_idx
*
stride_h
;
const
int
src_ih_offset
=
ih_idx
*
iw
;
const
int
dst_oh_offset
=
oh_idx
*
ow
;
for
(
int
ow_idx
=
0
;
ow_idx
<
ow_end
;
ow_idx
+=
ow_step
)
{
const
int
iw_idx
=
ow_idx
*
stride_w
;
const
int
src_offset
=
(
src_ih_offset
+
iw_idx
)
*
ic_step
;
const
int
dst_offset
=
(
dst_oh_offset
+
ow_idx
)
*
oc_step
;
KerPoolingFilterXStrideXNchw44
<
filter
,
stride
,
ow_step
,
mode
>::
impl
(
src
+
src_offset
,
dst
+
dst_offset
,
iw
);
}
if
(
ow_remain
>
0
)
{
kern_pooling_with_pad_nchw44
<
mode
>
(
src
,
dst
,
filter
,
ow_end
,
ow
,
iw
,
ow
,
stride_w
,
0
,
ih_idx
,
oh_idx
,
0
,
0
);
}
}
}
template
<
int
filter
,
int
stride
,
PoolingBase
::
Mode
mode
>
static
inline
void
pooling_fp32_nchw44
(
const
float32_t
*
src
,
float32_t
*
dst
,
int
ih
,
int
iw
,
int
oh
,
int
ow
,
int
ph
,
int
pw
)
{
if
(
ph
>
0
||
pw
>
0
)
{
pooling_fp32_nchw44_pad
<
filter
,
stride
,
mode
>
(
src
,
dst
,
ih
,
iw
,
oh
,
ow
,
ph
,
pw
);
}
else
{
pooling_fp32_nchw44_no_pad
<
filter
,
stride
,
mode
>
(
src
,
dst
,
ih
,
iw
,
oh
,
ow
);
}
}
}
// namespace
}
// namespace arm_common
}
// namespace megdnn
// vim: syntax=cpp.doxygen
\ No newline at end of file
dnn/src/arm_common/pooling/opr_impl.cpp
浏览文件 @
ad3c9315
...
...
@@ -29,6 +29,7 @@ class PoolingImpl::AlgoPack : NonCopyableObj {
AlgoFilter3ModexStridexNCHW44
algo_filter3_modex_stridex_nchw4
;
AlgoFilter4ModexStridexNCHW44
algo_filter4_modex_stridex_nchw4
;
AlgoFilter5ModexStridexNCHW44
algo_filter5_modex_stridex_nchw4
;
AlgoFp32ModexStridexNCHW44
algo_fp32_modex_stridex_nchw44
;
public:
AlgoPack
()
{
...
...
@@ -44,6 +45,7 @@ public:
all_algos
.
emplace_back
(
&
algo_filter2_modex_stridex_nchw4
);
all_algos
.
emplace_back
(
&
algo_filter4_modex_stridex_nchw4
);
all_algos
.
emplace_back
(
&
algo_filter5_modex_stridex_nchw4
);
all_algos
.
emplace_back
(
&
algo_fp32_modex_stridex_nchw44
);
}
SmallVector
<
AlgoBase
*>
all_algos
;
};
...
...
dnn/src/arm_common/pooling/opr_impl.h
浏览文件 @
ad3c9315
...
...
@@ -87,10 +87,10 @@ private:
class
AlgoFilter3ModexStridexNCHW44
;
class
AlgoFilter4ModexStridexNCHW44
;
class
AlgoFilter5ModexStridexNCHW44
;
class
AlgoFp32ModexStridexNCHW44
;
class
AlgoPack
;
};
}
// namespace arm_common
}
// namespace megdnn
// vim: syntax=cpp.doxygen
dnn/src/common/unroll_macro.h
浏览文件 @
ad3c9315
...
...
@@ -6,7 +6,8 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
...
...
@@ -32,10 +33,33 @@
#define UNROLL_RAW9(cb, v0, a...) \
UNROLL_RAW8(cb, v0, ##a) \
cb(8, ##a)
#define UNROLL_RAW10(cb, v0, a...) \
UNROLL_RAW9(cb, v0, ##a) \
cb(9, ##a)
#define UNROLL_RAW11(cb, v0, a...) \
UNROLL_RAW10(cb, v0, ##a) \
cb(10, ##a)
#define UNROLL_RAW12(cb, v0, a...) \
UNROLL_RAW11(cb, v0, ##a) \
cb(11, ##a)
#define UNROLL_RAW13(cb, v0, a...) \
UNROLL_RAW12(cb, v0, ##a) \
cb(12, ##a)
#define UNROLL_RAW14(cb, v0, a...) \
UNROLL_RAW13(cb, v0, ##a) \
cb(13, ##a)
#define UNROLL_RAW15(cb, v0, a...) \
UNROLL_RAW14(cb, v0, ##a) \
cb(14, ##a)
// clang-format off
#define UNROLL_RAW16(cb, v0, a...) \
UNROLL_RAW8(cb, v0, ##a) \
cb(8, ##a) cb(9, ##a) cb(10, ##a) cb(11, ##a) cb(12, ##a) cb(13, ##a) \
cb(14, ##a) cb(15, ##a)
#define UNROLL_RAW17(cb, v0, a...) \
UNROLL_RAW16(cb, v0, ##a) \
cb(16, ##a)
#define UNROLL_RAW24(cb, v0, a...) \
UNROLL_RAW16(cb, v0, ##a) \
cb(16, ##a) cb(17, ##a) cb(18, ##a) cb(19, ##a) cb(20, ##a) cb(21, ##a) \
...
...
@@ -130,4 +154,6 @@
#define UNROLL_CALL_NOWRAPPER_D2(step, step2, cb) \
UNROLL_CALL_RAW_D2(step, step2, cb)
// clang-format on
// vim: syntax=cpp.doxygen
dnn/test/arm_common/pooling.cpp
浏览文件 @
ad3c9315
...
...
@@ -256,6 +256,73 @@ TEST_F(ARM_COMMON, POOLING_QUANTIZED) {
}
#if MEGDNN_WITH_BENCHMARK
void
benchmark_nchw44_fp32
(
Handle
*
handle
)
{
using
Param
=
param
::
Pooling
;
auto
run
=
[
&
](
size_t
n
,
size_t
c
,
size_t
h
,
size_t
w
,
size_t
filter
,
size_t
stride
,
size_t
pad
,
Param
::
Mode
mode
)
{
Param
param
;
param
.
window_h
=
param
.
window_w
=
filter
;
param
.
stride_h
=
param
.
stride_w
=
stride
;
param
.
pad_h
=
param
.
pad_w
=
pad
;
param
.
format
=
Param
::
Format
::
NCHW
;
param
.
mode
=
mode
;
TensorShape
nchw_shape
=
{
n
,
c
,
h
,
w
};
TensorShape
nchw44_shape
=
{
n
,
c
/
4
,
h
,
w
,
4
};
TensorLayout
dst_layout
;
auto
opr
=
handle
->
create_operator
<
Pooling
>
();
opr
->
param
()
=
param
;
opr
->
deduce_layout
({
nchw_shape
,
dtype
::
Float32
()},
dst_layout
);
float
calc_amount
=
dst_layout
.
total_nr_elems
()
*
param
.
window_h
*
param
.
window_w
;
Benchmarker
<
Pooling
>
benchmarker_float_nchw
(
handle
);
Benchmarker
<
Pooling
>
benchmarker_float_nchw44
(
handle
);
Benchmarker
<
Pooling
>
benchmarker_int_nchw44
(
handle
);
size_t
RUN
=
500
;
auto
t1
=
benchmarker_float_nchw
.
set_display
(
false
)
.
set_times
(
RUN
)
.
set_param
(
param
)
.
exec
({
nchw_shape
,
{}});
param
.
format
=
Param
::
Format
::
NCHW44
;
auto
t2
=
benchmarker_int_nchw44
.
set_display
(
false
)
.
set_times
(
RUN
)
.
set_param
(
param
)
.
execl
({{
nchw44_shape
,
dtype
::
QuantizedS8
(
1.0
)},
{{},
dtype
::
QuantizedS8
(
1.0
)}});
auto
t3
=
benchmarker_float_nchw44
.
set_display
(
false
)
.
set_times
(
RUN
)
.
set_param
(
param
)
.
exec
({
nchw44_shape
,
{}});
printf
(
"{%zu %zu %zu %zu} filter = %zu, stride = %zu pad = %zu
\n
"
"nchw_fp32={%.3f ms, %.3f Mflops}, "
"nchw44_int={%.3f ms, %.3f Mflops}, "
"nchw44_fp32={%.3f ms, %.3f Mflops, speed_up %f}
\n\n
"
,
n
,
c
,
h
,
w
,
filter
,
stride
,
pad
,
t1
/
RUN
,
calc_amount
/
(
t1
/
RUN
*
1000
),
t2
/
RUN
,
calc_amount
/
(
t2
/
RUN
*
1000
),
t3
/
RUN
,
calc_amount
/
(
t3
/
RUN
*
1000
),
t1
/
t3
);
};
// Resnet50
run
(
1
,
64
,
112
,
112
,
3
,
2
,
1
,
param
::
Pooling
::
Mode
::
MAX
);
run
(
1
,
2048
,
7
,
7
,
7
,
1
,
0
,
param
::
Pooling
::
Mode
::
AVERAGE
);
// VGG16
run
(
1
,
64
,
224
,
224
,
2
,
2
,
0
,
param
::
Pooling
::
Mode
::
MAX
);
run
(
1
,
128
,
112
,
112
,
2
,
2
,
0
,
param
::
Pooling
::
Mode
::
MAX
);
run
(
1
,
256
,
56
,
56
,
2
,
2
,
0
,
param
::
Pooling
::
Mode
::
MAX
);
run
(
1
,
512
,
28
,
28
,
2
,
2
,
0
,
param
::
Pooling
::
Mode
::
MAX
);
run
(
1
,
512
,
14
,
14
,
2
,
2
,
0
,
param
::
Pooling
::
Mode
::
MAX
);
}
TEST_F
(
ARM_COMMON
,
BENCHMARK_POOLING_NCHW44_FP32
)
{
benchmark_nchw44_fp32
(
handle
());
}
TEST_F
(
ARM_COMMON_MULTI_THREADS
,
BENCHMARK_POOLING_NCHW44_FP32
)
{
benchmark_nchw44_fp32
(
handle
());
}
TEST_F
(
ARM_COMMON
,
BENCHMARK_POOLING_INT8_W3x3_S2x2
)
{
using
Param
=
param
::
Pooling
;
...
...
dnn/test/arm_common/pooling_multi_thread.cpp
浏览文件 @
ad3c9315
...
...
@@ -57,6 +57,65 @@ TEST_F(ARM_COMMON_MULTI_THREADS, POOLING) {
}
}
std
::
vector
<
std
::
pair
<
param
::
Pooling
,
TensorShapeArray
>>
get_nchw44_pool_args
(
size_t
filter
,
size_t
stride
)
{
constexpr
size_t
ic_step
=
4
;
std
::
vector
<
std
::
pair
<
param
::
Pooling
,
TensorShapeArray
>>
args
;
for
(
size_t
n
:
{
1
,
2
})
for
(
size_t
c
:
{
4
,
8
})
for
(
size_t
ih
:
{
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
})
for
(
size_t
iw
:
{
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
})
for
(
size_t
ph
:
{
0
,
1
,
2
})
for
(
size_t
pw
:
{
0
,
1
,
2
})
for
(
auto
mode
:
{
param
::
Pooling
::
Mode
::
MAX
,
param
::
Pooling
::
Mode
::
AVERAGE
})
if
(
ih
+
2
*
ph
>=
filter
&&
iw
+
2
*
pw
>=
filter
&&
filter
>
ph
&&
filter
>
pw
)
{
param
::
Pooling
param
;
param
.
mode
=
mode
;
param
.
format
=
param
::
Pooling
::
Format
::
NCHW44
;
param
.
pad_h
=
ph
;
param
.
pad_w
=
pw
;
param
.
stride_h
=
param
.
stride_w
=
stride
;
param
.
window_h
=
param
.
window_w
=
filter
;
args
.
emplace_back
(
std
::
make_pair
(
param
,
TensorShapeArray
{{
n
,
c
/
ic_step
,
ih
,
iw
,
ic_step
},
{}}));
}
return
args
;
}
void
run_pooling_check
(
Handle
*
handle
,
std
::
vector
<
std
::
pair
<
param
::
Pooling
,
TensorShapeArray
>>
args
,
bool
is_int8
)
{
Checker
<
Pooling
>
checker
(
handle
);
UniformIntRNG
rng_int8
{
INT8_MIN
>>
1
,
INT8_MAX
>>
1
};
UniformIntRNG
rng_fp32
{
-
10
,
10
};
if
(
is_int8
)
{
checker
.
set_dtype
(
0
,
dtype
::
QuantizedS8
(
1.1
f
));
checker
.
set_rng
(
0
,
&
rng_int8
);
}
else
{
checker
.
set_rng
(
0
,
&
rng_fp32
);
}
for
(
auto
arg
:
args
)
{
checker
.
set_param
(
arg
.
first
).
exec
(
arg
.
second
);
}
}
TEST_F
(
ARM_COMMON_MULTI_THREADS
,
POOLING_NCHW44_FP32
)
{
for
(
auto
filter
:
{
2
,
3
,
4
,
5
})
for
(
auto
stride
:
{
1
,
2
})
{
run_pooling_check
(
handle
(),
get_nchw44_pool_args
(
filter
,
stride
),
false
);
}
}
TEST_F
(
ARM_COMMON_MULTI_THREADS
,
POOLING_W3x3_NCHW44
)
{
// clang-format off
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录