Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
ea93927d
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
404
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
ea93927d
编写于
3月 20, 2020
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix(dnn/fallbackls): fix fallback convolution and all conv_bias algos
GitOrigin-RevId: e37fbe0ffe9f8ab6325f41d9ee6857115630b17a
上级
d346c878
变更
13
显示空白变更内容
内联
并排
Showing
13 changed file
with
312 addition
and
238 deletion
+312
-238
dnn/src/fallback/conv_bias/algos.cpp
dnn/src/fallback/conv_bias/algos.cpp
+6
-0
dnn/src/fallback/conv_bias/im2col/algos.cpp
dnn/src/fallback/conv_bias/im2col/algos.cpp
+1
-2
dnn/src/fallback/conv_bias/opr_impl.cpp
dnn/src/fallback/conv_bias/opr_impl.cpp
+53
-45
dnn/src/fallback/conv_bias/opr_impl.h
dnn/src/fallback/conv_bias/opr_impl.h
+24
-13
dnn/src/fallback/convolution/algos.cpp
dnn/src/fallback/convolution/algos.cpp
+5
-2
dnn/src/fallback/convolution/algos.h
dnn/src/fallback/convolution/algos.h
+21
-6
dnn/src/fallback/convolution/opr_impl.cpp
dnn/src/fallback/convolution/opr_impl.cpp
+4
-30
dnn/src/fallback/convolution/opr_impl.h
dnn/src/fallback/convolution/opr_impl.h
+37
-0
dnn/src/x86/conv_bias/f32/algos.cpp
dnn/src/x86/conv_bias/f32/algos.cpp
+91
-74
dnn/src/x86/conv_bias/f32/algos.h
dnn/src/x86/conv_bias/f32/algos.h
+8
-4
dnn/src/x86/conv_bias/int8/algos.cpp
dnn/src/x86/conv_bias/int8/algos.cpp
+13
-10
dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.cpp
dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.cpp
+24
-26
dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.cpp
dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.cpp
+25
-26
未找到文件。
dnn/src/fallback/conv_bias/algos.cpp
浏览文件 @
ea93927d
...
...
@@ -213,11 +213,17 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoNaive::dispatch_kerns(
const
NCBKernParam
&
param
,
const
NCBKernIndex
&
ncb_index
)
{
MIDOUT_BEGIN
(
megdnn_fallback_naive
,
2
)
{
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
];
size_t
batch_id
=
ncb_index
.
ndrange_id
[
1
];
size_t
thread_id
=
ncb_index
.
thread_id
;
auto
thread_param
=
param
;
thread_param
.
workspace_ptr
=
reinterpret_cast
<
void
*>
(
reinterpret_cast
<
ptrdiff_t
>
(
param
.
workspace_ptr
)
+
thread_id
*
workspace_per_thread
);
thread_param
.
filter_ptr
=
param
.
filter
<
void
>
(
group_id
);
thread_param
.
dst_ptr
=
param
.
dst
<
void
>
(
batch_id
,
group_id
);
thread_param
.
src_ptr
=
param
.
src
<
void
>
(
batch_id
,
group_id
);
thread_param
.
bias_ptr
=
param
.
bias
<
void
>
(
batch_id
,
group_id
);
kern_default
(
opr_param
,
thread_param
);
}
MIDOUT_END
();
...
...
dnn/src/fallback/conv_bias/im2col/algos.cpp
浏览文件 @
ea93927d
...
...
@@ -111,7 +111,6 @@ static void copy_padding_kern(WorkspaceBundle bundle,
size_t
channel_id
=
ncb_index
.
ndrange_id
[
2
];
size_t
padding_group_size
=
IH2
*
IW2
*
IC
;
size_t
input_channel_offset
=
IH
*
IW
*
channel_id
;
size_t
workspace_channel_offset
=
IH2
*
IW2
*
channel_id
;
size_t
workspace_group_offset
=
group_id
*
padding_group_size
;
size_t
workspace_batch_offset
=
...
...
@@ -123,7 +122,7 @@ static void copy_padding_kern(WorkspaceBundle bundle,
src_zp
=
param
.
src_type
.
param
<
dtype
::
Quantized8Asymm
>
().
zero_point
;
}
src_ctype
*
src
=
const_cast
<
src_ctype
*>
(
param
.
src
<
src_ctype
>
(
batch_id
,
group_id
)
+
input_channel_offset
);
param
.
src
<
src_ctype
>
(
batch_id
,
group_id
,
channel_id
)
);
src_ctype
*
src2
;
src2
=
static_cast
<
src_ctype
*>
(
bundle
.
get
(
Im2colBundelIndex
::
BUNDLE_PADDING_INDEX
))
+
...
...
dnn/src/fallback/conv_bias/opr_impl.cpp
浏览文件 @
ea93927d
...
...
@@ -246,10 +246,9 @@ void ConvBiasImpl::exec_with_ncb_kern(const NCBKernParam& param,
ConvBiasImpl
::
Algorithm
*
algo
)
{
auto
ncb_kerns
=
ncb_algo_dispatch_kerns
(
algo
,
param
);
for
(
auto
&&
kernel
:
ncb_kerns
)
{
auto
run
=
[
=
](
size_t
index
,
size_t
thread_id
)
{
auto
copy_param
=
param
;
auto
run
=
[
kernel
,
param
](
size_t
index
,
size_t
thread_id
)
{
CpuNDRange
ndrange_id
(
kernel
.
global_size
,
index
);
kernel
.
kern
(
copy_
param
,
{
thread_id
,
ndrange_id
});
kernel
.
kern
(
param
,
{
thread_id
,
ndrange_id
});
};
static_cast
<
naive
::
HandleImpl
*>
(
handle
())
->
dispatch_kern
(
run
,
kernel
.
global_size
.
total_size
());
...
...
@@ -328,28 +327,29 @@ const char* ConvBiasImpl::get_algorithm_set_name() const {
namespace
megdnn
{
namespace
fallback
{
//! when format is nchwxx and channel wise mode, multi group will pack
//! together, so pack_group_size is the number of packed group
template
<
typename
T
>
const
T
*
ConvBiasImpl
::
NCBKernParam
::
src
(
size_t
batch_id
,
size_t
group_id
,
size_t
group_pack_size
)
const
{
src_type
.
assert_is_compatible_ctype
<
T
>
();
const
T
*
ConvBiasImpl
::
NCBKernParam
::
src
(
size_t
batch_id
,
size_t
group_pack_id
,
size_t
channel_pack_id
,
size_t
group_pack_size
,
size_t
channel_pack_size
)
const
{
size_t
batch_offset
=
batch_id
*
inp_bs
*
src_type
.
size
();
size_t
group_offset
=
group_pack_size
*
group_id
*
filter_meta
.
icpg
*
size_t
group_offset
=
group_pack_size
*
group_
pack_
id
*
filter_meta
.
icpg
*
isz
[
0
]
*
isz
[
1
]
*
src_type
.
size
();
size_t
channel_offset
=
channel_pack_size
*
channel_pack_id
*
isz
[
0
]
*
isz
[
1
]
*
src_type
.
size
();
return
reinterpret_cast
<
T
*>
(
reinterpret_cast
<
ptrdiff_t
>
(
src_ptr
)
+
batch_offset
+
group_offset
);
batch_offset
+
group_offset
+
channel_offset
);
}
//! when format is nchwxx and channel wise mode, multi group will pack
//! together, so pack_group_size is the number of packed group
template
<
typename
T
>
const
T
*
ConvBiasImpl
::
NCBKernParam
::
filter
(
size_t
group_id
,
const
T
*
ConvBiasImpl
::
NCBKernParam
::
filter
(
size_t
group_
pack_
id
,
size_t
pack_group_size
)
const
{
size_t
group_offset
=
0
_z
;
switch
(
filter_meta
.
format
)
{
case
Param
::
Format
::
NCHW
:
{
group_offset
=
pack_group_size
*
group_id
*
filter_meta
.
icpg
*
group_offset
=
pack_group_size
*
group_
pack_
id
*
filter_meta
.
icpg
*
filter_meta
.
ocpg
*
filter_meta
.
spatial
[
0
]
*
filter_meta
.
spatial
[
1
]
*
filter_type
.
size
();
break
;
...
...
@@ -359,15 +359,15 @@ const T* ConvBiasImpl::NCBKernParam::filter(size_t group_id,
size_t
icpg
=
filter_meta
.
icpg
;
size_t
ocpg
=
filter_meta
.
ocpg
;
//! four format of weight layout
//! 1. {oc/8, ic/8, fh, fw, 8, 8},
2. {g, oc/8, ic/8, fh,
//!
fw, 8, 8}
//! 3. {g/8,
1, 1, fh, fw, 8, 8}, 3. {oc/8 ,
fh, fw, ic, 8}
//! 1. {oc/8, ic/8, fh, fw, 8, 8},
//!
2. {g, oc/8, ic/8, fh, fw, 8, 8},
//! 3. {g/8,
fh, fw, 1, 1, 8}, 4. {oc/8,
fh, fw, ic, 8}
megdnn_assert
((
icpg
%
8
==
0
&&
ocpg
%
8
==
0
)
||
(
group
%
8
==
0
&&
icpg
==
1
&&
ocpg
==
1
&&
pack_group_size
>
1
)
||
(
group
==
1
&&
ocpg
%
8
==
0
),
"The filter shepe is not right of nchw88"
);
group_offset
=
pack_group_size
*
group_id
*
filter_meta
.
icpg
*
group_offset
=
pack_group_size
*
group_
pack_
id
*
filter_meta
.
icpg
*
filter_meta
.
ocpg
*
filter_meta
.
spatial
[
0
]
*
filter_meta
.
spatial
[
1
]
*
filter_type
.
size
();
...
...
@@ -380,7 +380,7 @@ const T* ConvBiasImpl::NCBKernParam::filter(size_t group_id,
//! 2. {alpha, alpha, ocpg/8, icpg/8, 8, 8}
//! 3. {g, alpha, alpha, oc, ic, 8, 8}
//! 4. {alpha, alpha, oc, ic}
group_offset
=
pack_group_size
*
group_id
*
filter_meta
.
icpg
*
group_offset
=
pack_group_size
*
group_
pack_
id
*
filter_meta
.
icpg
*
filter_meta
.
ocpg
*
(
filter_meta
.
spatial
[
0
]
+
output_block_size
-
1
)
*
(
filter_meta
.
spatial
[
1
]
+
output_block_size
-
1
)
*
...
...
@@ -388,58 +388,66 @@ const T* ConvBiasImpl::NCBKernParam::filter(size_t group_id,
break
;
}
default:
megdnn_assert
(
"other filter format is not support yet"
);
megdnn_assert
(
0
,
"other filter format is not support yet"
);
}
return
reinterpret_cast
<
T
*>
(
reinterpret_cast
<
ptrdiff_t
>
(
filter_ptr
)
+
group_offset
);
}
//! when format is nchwxx and channel wise mode, multi group will pack
//! together, so pack_group_size is the number of packed group
template
<
typename
T
>
const
T
*
ConvBiasImpl
::
NCBKernParam
::
bias
(
size_t
batch_id
,
size_t
group_id
,
size_t
group_pack_size
)
const
{
bias_type
.
assert_is_compatible_ctype
<
T
>
();
const
T
*
ConvBiasImpl
::
NCBKernParam
::
bias
(
size_t
batch_id
,
size_t
group_pack_id
,
size_t
channel_pack_id
,
size_t
group_pack_size
,
size_t
channel_pack_size
)
const
{
size_t
batch_offset
=
0
_z
;
size_t
group_offset
=
0
_z
;
size_t
channel_offset
=
0
_z
;
if
(
bias_mode
==
BiasMode
::
BIAS
)
{
batch_offset
=
batch_id
*
bias_bs
*
bias_type
.
size
();
group_offset
=
group_pack_size
*
group_id
*
filter_meta
.
ocpg
*
osz
[
0
]
*
osz
[
1
]
*
bias_type
.
size
();
group_offset
=
group_pack_size
*
group_pack_id
*
filter_meta
.
ocpg
*
osz
[
0
]
*
osz
[
1
]
*
bias_type
.
size
();
channel_offset
=
channel_pack_size
*
channel_pack_id
*
osz
[
0
]
*
osz
[
1
]
*
bias_type
.
size
();
}
else
if
(
bias_mode
==
BiasMode
::
BROADCAST_CHANNEL_BIAS
)
{
group_offset
=
group_pack_size
*
group_id
*
filter_meta
.
ocpg
*
group_offset
=
group_pack_size
*
group_
pack_
id
*
filter_meta
.
ocpg
*
bias_type
.
size
();
channel_offset
=
channel_pack_size
*
channel_pack_id
*
bias_type
.
size
();
}
return
reinterpret_cast
<
T
*>
(
reinterpret_cast
<
ptrdiff_t
>
(
bias_ptr
)
+
batch_offset
+
group_offset
);
batch_offset
+
group_offset
+
channel_offset
);
}
//! when format is nchwxx and channel wise mode, multi group will pack
//! together, so pack_group_size is the number of packed group
template
<
typename
T
>
T
*
ConvBiasImpl
::
NCBKernParam
::
dst
(
size_t
batch_id
,
size_t
group_id
,
size_t
group_pack_size
)
const
{
dst_type
.
assert_is_compatible_ctype
<
T
>
();
T
*
ConvBiasImpl
::
NCBKernParam
::
dst
(
size_t
batch_id
,
size_t
group_pack_id
,
size_t
channel_pack_id
,
size_t
group_pack_size
,
size_t
channel_pack_size
)
const
{
size_t
batch_offset
=
batch_id
*
out_bs
*
dst_type
.
size
();
size_t
group_offset
=
group_pack_size
*
group_id
*
filter_meta
.
ocpg
*
size_t
group_offset
=
group_pack_size
*
group_
pack_
id
*
filter_meta
.
ocpg
*
osz
[
0
]
*
osz
[
1
]
*
dst_type
.
size
();
size_t
channel_offset
=
channel_pack_size
*
channel_pack_id
*
osz
[
0
]
*
osz
[
1
]
*
dst_type
.
size
();
return
reinterpret_cast
<
T
*>
(
reinterpret_cast
<
ptrdiff_t
>
(
dst_ptr
)
+
batch_offset
+
group_offset
);
batch_offset
+
group_offset
+
channel_offset
);
}
#define INST(T) \
template const T* ConvBiasImpl::NCBKernParam::src<T>( \
size_t batch_id, size_t group_id, size_t group_pack_size) const; \
size_t batch_id, size_t group_id, size_t channel_id, \
size_t group_pack_size, size_t channel_pack_size) const; \
template const T* ConvBiasImpl::NCBKernParam::bias<T>( \
size_t batch_id, size_t group_id, size_t group_pack_size) const; \
size_t batch_id, size_t group_id, size_t channel_id, \
size_t group_pack_size, size_t channel_pack_size) const; \
template const T* ConvBiasImpl::NCBKernParam::filter<T>( \
size_t group_id, size_t group_pack_size) const; \
template T* ConvBiasImpl::NCBKernParam::dst<T>( \
size_t batch_id, size_t group_id, size_t group_pack_size) const;
size_t batch_id, size_t group_id, size_t channel_id, \
size_t group_pack_size, size_t channel_pack_size) const;
#define INST_DT(d) INST(DTypeTrait<d>::ctype)
MEGDNN_FOREACH_COMPUTING_DTYPE
(
INST_DT
)
INST
(
void
)
#undef INST
#undef INST_DT
}
// namespace fallback
...
...
dnn/src/fallback/conv_bias/opr_impl.h
浏览文件 @
ea93927d
...
...
@@ -103,10 +103,32 @@ public:
src_type
.
assert_is_compatible_ctype
<
T
>
();
return
static_cast
<
const
T
*>
(
src_ptr
);
}
//! when format is nchwxx, multi channel will pack into one
//! chnannel_pack_id. pack_channel_size is the number of packed channel
//! when format is nchwxx and channel wise, multi group will pack into
//! one group_pack_id. group_pack_size is the number of packed group
//! together, like weight shape is {g/8, 1, 1, Fh, Fw, 8}
template
<
typename
T
>
const
T
*
src
(
size_t
batch_id
,
size_t
group_pack_id
,
size_t
channel_pack_id
=
0
,
size_t
group_pack_size
=
1
,
size_t
channel_pack_size
=
1
)
const
;
template
<
typename
T
>
const
T
*
bias
(
size_t
batch_id
,
size_t
group_pack_id
,
size_t
channel_pack_id
=
0
,
size_t
group_pack_size
=
1
,
size_t
channel_pack_size
=
1
)
const
;
template
<
typename
T
>
const
T
*
src
(
size_t
batch_id
,
size_t
group_id
,
size_t
group_pack_size
=
1
_z
)
const
;
T
*
dst
(
size_t
batch_id
,
size_t
group_pack_id
,
size_t
channel_pack_id
=
0
,
size_t
group_pack_size
=
1
,
size_t
channel_pack_size
=
1
)
const
;
//! when format is nchwxx and channel wise, multi group will pack into
//! one group_pack_id. group_pack_size is the number of packed group
//! together, like weight shape is {g/8, 1, 1, Fh, Fw, 8}
template
<
typename
T
>
const
T
*
filter
(
size_t
group_pack_id
,
size_t
pack_group_size
=
1
_z
)
const
;
template
<
typename
T
>
const
T
*
filter
()
const
{
...
...
@@ -114,29 +136,18 @@ public:
return
static_cast
<
const
T
*>
(
filter_ptr
);
}
template
<
typename
T
>
const
T
*
filter
(
size_t
group_id
,
size_t
pack_group_size
=
1
_z
)
const
;
template
<
typename
T
>
const
T
*
bias
()
const
{
bias_type
.
assert_is_compatible_ctype
<
T
>
();
return
static_cast
<
const
T
*>
(
bias_ptr
);
}
template
<
typename
T
>
const
T
*
bias
(
size_t
batch_id
,
size_t
group_id
,
size_t
group_pack_size
=
1
_z
)
const
;
template
<
typename
T
>
T
*
dst
()
const
{
dst_type
.
assert_is_compatible_ctype
<
T
>
();
return
static_cast
<
T
*>
(
dst_ptr
);
}
template
<
typename
T
>
T
*
dst
(
size_t
batch_id
,
size_t
group_id
,
size_t
group_pack_size
=
1
_z
)
const
;
template
<
typename
T
>
T
*
workspace
()
const
{
return
static_cast
<
T
*>
(
workspace_ptr
);
...
...
dnn/src/fallback/convolution/algos.cpp
浏览文件 @
ea93927d
...
...
@@ -197,9 +197,12 @@ ConvolutionImpl::AlgoFallback::dispatch_kern(
auto
kern_fallback
=
[
workspace_per_thread
](
const
NCBKernParam
&
p
,
const
NCBKernIndex
&
ncb_index
)
{
UNPACK_CONV_F32_NCB_KERN_SIZES
(
p
);
size_t
batch_id
=
ncb_index
.
ndrange_id
[
1
];
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
];
MEGDNN_MARK_USED_VAR
(
N
);
auto
src
=
p
.
src
<
float
>
(),
filter
=
p
.
filter
<
float
>
();
auto
dst
=
p
.
dst
<
float
>
();
auto
src
=
p
.
src
<
float
>
(
batch_id
,
group_id
),
filter
=
p
.
filter
<
float
>
(
group_id
);
auto
dst
=
p
.
dst
<
float
>
(
batch_id
,
group_id
);
size_t
thread_id
=
ncb_index
.
thread_id
;
void
*
workspace_ptr
=
reinterpret_cast
<
void
*>
(
reinterpret_cast
<
ptrdiff_t
>
(
p
.
workspace_ptr
)
+
...
...
dnn/src/fallback/convolution/algos.h
浏览文件 @
ea93927d
...
...
@@ -20,16 +20,23 @@ namespace fallback {
template
<
typename
ST
,
typename
DT
,
typename
CT
>
void
kern_naive_forward
(
const
ConvolutionImpl
::
NCBKernParam
&
p
,
const
ConvolutionImpl
::
NCBKernIndex
&
/*index*/
)
{
const
ConvolutionImpl
::
NCBKernIndex
&
ncb_index
)
{
size_t
batch_id
=
ncb_index
.
ndrange_id
[
1
];
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
];
auto
IC
=
p
.
filter_meta
.
icpg
,
IH
=
p
.
isz
[
0
],
IW
=
p
.
isz
[
1
],
OC
=
p
.
filter_meta
.
ocpg
,
OH
=
p
.
osz
[
0
],
OW
=
p
.
osz
[
1
];
ptrdiff_t
fstrd
=
p
.
filter_meta
.
icpg
*
p
.
filter_meta
.
ocpg
*
p
.
filter_meta
.
spatial
[
0
]
*
p
.
filter_meta
.
spatial
[
1
]
*
p
.
filter_type
.
size
();
ptrdiff_t
istrd
=
p
.
filter_meta
.
icpg
*
p
.
src_type
.
size
();
ptrdiff_t
ostrd
=
p
.
filter_meta
.
ocpg
*
p
.
dst_type
.
size
();
TensorND
src
,
dst
;
src
.
raw_ptr
=
const_cast
<
void
*>
(
p
.
src_ptr
);
dst
.
raw_ptr
=
p
.
dst_ptr
;
src
.
layout
.
dtype
=
p
.
src_type
;
dst
.
layout
.
dtype
=
p
.
dst_type
;
if
(
p
.
filter_meta
.
format
==
param
::
Convolution
::
Format
::
NCHW
)
{
istrd
*=
p
.
isz
[
0
]
*
p
.
isz
[
1
];
ostrd
*=
p
.
osz
[
0
]
*
p
.
osz
[
1
];
src
.
layout
.
init_contiguous_stride
({
1
,
IC
,
IH
,
IW
});
dst
.
layout
.
init_contiguous_stride
({
1
,
OC
,
OH
,
OW
});
}
else
{
...
...
@@ -41,9 +48,17 @@ void kern_naive_forward(const ConvolutionImpl::NCBKernParam& p,
src
.
layout
.
init_contiguous_stride
({
1
,
IH
,
IW
,
IC
});
dst
.
layout
.
init_contiguous_stride
({
1
,
OH
,
OW
,
OC
});
}
src
.
raw_ptr
=
reinterpret_cast
<
void
*>
(
reinterpret_cast
<
uintptr_t
>
(
p
.
src_ptr
)
+
batch_id
*
p
.
inp_bs
*
p
.
src_type
.
size
()
+
group_id
*
istrd
);
dst
.
raw_ptr
=
reinterpret_cast
<
void
*>
(
reinterpret_cast
<
uintptr_t
>
(
p
.
dst_ptr
)
+
batch_id
*
p
.
out_bs
*
p
.
dst_type
.
size
()
+
group_id
*
ostrd
);
ST
*
filter
=
reinterpret_cast
<
ST
*>
(
reinterpret_cast
<
uintptr_t
>
(
p
.
filter_ptr
)
+
group_id
*
fstrd
);
std
::
copy
(
p
.
inp_s
,
p
.
inp_s
+
4
,
src
.
layout
.
stride
);
std
::
copy
(
p
.
out_s
,
p
.
out_s
+
4
,
dst
.
layout
.
stride
);
naive
::
convolution
::
forward
<
ST
,
ST
,
DT
,
CT
>
(
src
,
p
.
filter
<
ST
>
()
,
dst
,
naive
::
convolution
::
forward
<
ST
,
ST
,
DT
,
CT
>
(
src
,
filter
,
dst
,
p
.
filter_meta
);
}
...
...
dnn/src/fallback/convolution/opr_impl.cpp
浏览文件 @
ea93927d
...
...
@@ -189,41 +189,15 @@ ConvolutionImpl::NCBKernParam ConvolutionImpl::make_ncb_kern_param(
void
ConvolutionImpl
::
exec_with_ncb_kern
(
const
NCBKernParam
&
param
,
Algorithm
*
algo
)
{
auto
kerns
=
ncb_algo_dispatch_kern
(
algo
,
param
);
size_t
src_batch_stride
=
param
.
inp_bs
*
param
.
src_type
.
size
();
size_t
dst_batch_stride
=
param
.
out_bs
*
param
.
dst_type
.
size
();
auto
group
=
param
.
filter_meta
.
group
;
auto
fallback_handle
=
handle
();
for
(
auto
kernel
:
kerns
)
{
megdnn_assert
(
param
.
filter_meta
.
format
==
Param
::
Format
::
NCHW
||
param
.
filter_meta
.
format
==
Param
::
Format
::
NHWC
||
param
.
filter_meta
.
format
==
Param
::
Format
::
NCHW88
,
"invalid conv format"
);
ptrdiff_t
istrd
=
0
,
fstrd
=
0
,
ostrd
=
0
;
fstrd
=
param
.
filter_meta
.
icpg
*
param
.
filter_meta
.
ocpg
*
param
.
filter_meta
.
spatial
[
0
]
*
param
.
filter_meta
.
spatial
[
1
]
*
param
.
filter_type
.
size
();
istrd
=
param
.
filter_meta
.
icpg
*
param
.
src_type
.
size
();
ostrd
=
param
.
filter_meta
.
ocpg
*
param
.
dst_type
.
size
();
if
(
param
.
filter_meta
.
format
==
Param
::
Format
::
NCHW
)
{
istrd
*=
param
.
isz
[
0
]
*
param
.
isz
[
1
];
ostrd
*=
param
.
osz
[
0
]
*
param
.
osz
[
1
];
}
else
{
// must be NHWC. No action performed.
}
auto
run
=
[
=
](
size_t
index
,
size_t
thread_id
)
{
auto
copy_param
=
param
;
auto
run
=
[
param
,
kernel
](
size_t
index
,
size_t
thread_id
)
{
CpuNDRange
ndrange_id
(
kernel
.
global_size
,
index
);
size_t
group_id
=
ndrange_id
[
0
];
size_t
batch_id
=
ndrange_id
[
1
];
megdnn_assert
(
group_id
<
group
,
"The group id should smaller than gruop"
);
//! The kernel ptr point to batch index
incr_ptr
(
copy_param
.
src_ptr
,
group_id
*
istrd
+
batch_id
*
src_batch_stride
);
incr_ptr
(
copy_param
.
filter_ptr
,
group_id
*
fstrd
);
incr_ptr
(
copy_param
.
dst_ptr
,
group_id
*
ostrd
+
batch_id
*
dst_batch_stride
);
kernel
.
kern
(
copy_param
,
{
thread_id
,
ndrange_id
});
kernel
.
kern
(
param
,
{
thread_id
,
ndrange_id
});
};
static_cast
<
naive
::
HandleImpl
*>
(
fallback_handle
)
->
dispatch_kern
(
run
,
kernel
.
global_size
.
total_size
());
...
...
dnn/src/fallback/convolution/opr_impl.h
浏览文件 @
ea93927d
...
...
@@ -100,6 +100,43 @@ public:
T
*
workspace
()
const
{
return
static_cast
<
T
*>
(
workspace_ptr
);
}
//! when format is nchwxx and channel wise, multi group will pack into
//! one group_pack_id. group_pack_size is the number of packed group
//! together, like weight shape is {g/8, 1, 1, Fh, Fw, 8}
template
<
typename
T
>
T
*
dst
(
size_t
batch_id
,
size_t
group_pack_id
,
size_t
group_pack_size
=
1
_z
)
const
{
size_t
batch_offset
=
batch_id
*
out_bs
*
dst_type
.
size
();
size_t
group_offset
=
group_pack_size
*
group_pack_id
*
filter_meta
.
ocpg
*
osz
[
0
]
*
osz
[
1
]
*
dst_type
.
size
();
return
reinterpret_cast
<
T
*>
(
reinterpret_cast
<
ptrdiff_t
>
(
dst_ptr
)
+
batch_offset
+
group_offset
);
}
template
<
typename
T
>
const
T
*
src
(
size_t
batch_id
,
size_t
group_pack_id
,
size_t
group_pack_size
=
1
_z
)
const
{
size_t
batch_offset
=
batch_id
*
inp_bs
*
src_type
.
size
();
size_t
group_offset
=
group_pack_size
*
group_pack_id
*
filter_meta
.
icpg
*
isz
[
0
]
*
isz
[
1
]
*
src_type
.
size
();
return
reinterpret_cast
<
T
*>
(
reinterpret_cast
<
ptrdiff_t
>
(
src_ptr
)
+
batch_offset
+
group_offset
);
}
template
<
typename
T
>
const
T
*
filter
(
size_t
group_pack_id
,
size_t
pack_group_size
=
1
_z
)
const
{
size_t
group_offset
=
pack_group_size
*
group_pack_id
*
filter_meta
.
icpg
*
filter_meta
.
ocpg
*
filter_meta
.
spatial
[
0
]
*
filter_meta
.
spatial
[
1
]
*
filter_type
.
size
();
return
reinterpret_cast
<
T
*>
(
reinterpret_cast
<
ptrdiff_t
>
(
filter_ptr
)
+
group_offset
);
}
};
static
void
*
const
sm_fallback_conv_algo_type
;
...
...
dnn/src/x86/conv_bias/f32/algos.cpp
浏览文件 @
ea93927d
...
...
@@ -74,25 +74,27 @@ void get_rectified_size(size_t IH, size_t IW, size_t OH, size_t OW, size_t FH,
size_t OC = fm.ocpg; \
WorkspaceBundle bundle = wbundle; \
for (size_t ic = 0; ic < IC; ic++) { \
copy_padding_kern( \
bundle, kern_param, \
{ncb_index.thread_id, {ncb_index.thread_id, 0, ic}}); \
copy_padding_kern(bundle, kern_param, ncb_index, \
{ncb_index.thread_id, 0, ic}); \
} \
for (size_t oc = 0; oc < OC; oc++) { \
do_conv_kern( \
bundle, kern_param, \
{ncb_index.thread_id, {ncb_index.thread_id, 0, oc}}); \
do_conv_kern(bundle, kern_param, ncb_index, \
{ncb_index.thread_id, 0, oc}); \
} \
}; \
ret_kerns.push_back({exec_one_group, {group, N, 1_z}}); \
} else { \
WorkspaceBundle bundle = wbundle; \
auto copy_padding = \
std::bind(copy_padding_kern, bundle, std::placeholders::_1, \
std::placeholders::_2); \
auto copy_padding = [wbundle](const NCBKernParam& kern_param, \
const NCBKernIndex& ncb_index) { \
copy_padding_kern(wbundle, kern_param, ncb_index, \
ncb_index.ndrange_id); \
}; \
ret_kerns.push_back({copy_padding, {group, N, IC}}); \
auto do_conv = std::bind(do_conv_kern, bundle, std::placeholders::_1, \
std::placeholders::_2); \
auto do_conv = [wbundle](const NCBKernParam& kern_param, \
const NCBKernIndex& ncb_index) { \
do_conv_kern(wbundle, kern_param, ncb_index, \
ncb_index.ndrange_id); \
}; \
ret_kerns.push_back({do_conv, {group, N, OC}}); \
} \
return ret_kerns;
...
...
@@ -145,7 +147,8 @@ size_t ConvBiasImpl::AlgoDirect::get_workspace(
//! Process one input channel copy padding
void
ConvBiasImpl
::
AlgoDirect
::
copy_padding_kern
(
WorkspaceBundle
bundle
,
const
ConvBiasImpl
::
NCBKernParam
&
kern_param
,
const
ConvBiasImpl
::
NCBKernIndex
&
ncb_index
)
{
const
ConvBiasImpl
::
NCBKernIndex
&
ncb_index
,
const
CpuNDRange
&
workspace_ids
)
{
size_t
IH
=
kern_param
.
isz
[
0
];
size_t
IW
=
kern_param
.
isz
[
1
];
size_t
IC
=
kern_param
.
filter_meta
.
icpg
;
...
...
@@ -160,14 +163,18 @@ void ConvBiasImpl::AlgoDirect::copy_padding_kern(
get_rectified_img_size
(
IH
,
IW
,
FH
,
FW
,
OH
,
OW
,
PH
,
PW
,
IH2
,
IW2
,
OH2
,
OW2
);
bool
rectify_src
=
(
IH
!=
IH2
||
IW
!=
IW2
);
size_t
padding_group_size
=
IH2
*
IW2
*
IC
;
const
float
*
sptr
=
static_cast
<
const
float
*>
(
kern_param
.
src_ptr
)
+
ncb_index
.
ndrange_id
[
2
]
*
IH
*
IW
;
size_t
batch_id
=
ncb_index
.
ndrange_id
[
1
];
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
];
size_t
channel_id
=
workspace_ids
[
2
];
const
float
*
sptr
=
static_cast
<
const
float
*>
(
kern_param
.
src
<
float
>
(
batch_id
,
group_id
))
+
channel_id
*
IH
*
IW
;
bundle
.
set
(
kern_param
.
workspace_ptr
);
//! Used for get the workspace offset
size_t
workspace_group_id
=
ncb_index
.
ndrange_id
[
0
],
workspace_batch_id
=
ncb_index
.
ndrange_id
[
1
],
workspace_channel_id
=
ncb_index
.
ndrange_id
[
2
];
size_t
workspace_group_id
=
workspace_ids
[
0
],
workspace_batch_id
=
workspace_ids
[
1
],
workspace_channel_id
=
workspace_ids
[
2
];
//! If large group, each thread has its own worspace, set group_id with
//! thread_id
if
(
rectify_src
)
{
...
...
@@ -234,7 +241,8 @@ void ConvBiasImpl::AlgoDirect::copy_padding_kern(
//! compute one output channel
void
ConvBiasImpl
::
AlgoDirect
::
do_conv_kern
(
WorkspaceBundle
bundle
,
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
{
const
NCBKernIndex
&
ncb_index
,
const
CpuNDRange
&
workspace_ids
)
{
size_t
OH
=
kern_param
.
osz
[
0
];
size_t
OW
=
kern_param
.
osz
[
1
];
size_t
IH
=
kern_param
.
isz
[
0
];
...
...
@@ -265,14 +273,16 @@ void ConvBiasImpl::AlgoDirect::do_conv_kern(WorkspaceBundle bundle,
megdnn
::
BiasMode
::
BROADCAST_CHANNEL_BIAS
)
{
bias_offset
=
1
_z
;
}
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
];
size_t
batch_id
=
ncb_index
.
ndrange_id
[
1
];
//! Used for get the workspace offset
size_t
workspace_group_id
=
ncb_index
.
ndrange_id
[
0
],
workspace_batch_id
=
ncb_index
.
ndrange_id
[
1
],
oc
=
ncb_index
.
ndrange_id
[
2
]
;
const
float
*
sptr
=
kern_param
.
src
<
float
>
()
;
const
float
*
filter
=
kern_param
.
filter
<
float
>
()
+
oc
*
FH
*
FW
*
IC
;
const
float
*
bias_ptr
=
kern_param
.
bias
<
float
>
(
)
+
oc
*
bias_offset
;
float
*
dst
=
kern_param
.
dst
<
float
>
()
+
oc
*
OH
*
OW
;
size_t
workspace_group_id
=
workspace_ids
[
0
],
workspace_batch_id
=
workspace_ids
[
1
],
oc
=
workspace_ids
[
2
];
const
float
*
sptr
=
kern_param
.
src
<
float
>
(
batch_id
,
group_id
)
;
const
float
*
filter
=
kern_param
.
filter
<
float
>
(
group_id
)
+
oc
*
FH
*
FW
*
IC
;
const
float
*
bias_ptr
=
kern_param
.
bias
<
float
>
(
batch_id
,
group_id
)
+
oc
*
bias_offset
;
float
*
dst
=
kern_param
.
dst
<
float
>
(
batch_id
,
group_id
)
+
oc
*
OH
*
OW
;
if
(
rectify_src
)
{
sptr
=
static_cast
<
float
*>
(
bundle
.
get
(
0
))
+
workspace_group_id
*
padding_group_size
+
...
...
@@ -358,7 +368,8 @@ size_t ConvBiasImpl::AlgoDirectStride2::get_workspace(
//! Process one input channel copy padding
void
ConvBiasImpl
::
AlgoDirectStride2
::
copy_padding_kern
(
WorkspaceBundle
bundle
,
const
ConvBiasImpl
::
NCBKernParam
&
kern_param
,
const
ConvBiasImpl
::
NCBKernIndex
&
ncb_index
)
{
const
ConvBiasImpl
::
NCBKernIndex
&
ncb_index
,
const
CpuNDRange
&
workspace_ids
)
{
size_t
IH
=
kern_param
.
isz
[
0
];
size_t
IW
=
kern_param
.
isz
[
1
];
size_t
IC
=
kern_param
.
filter_meta
.
icpg
;
...
...
@@ -373,13 +384,17 @@ void ConvBiasImpl::AlgoDirectStride2::copy_padding_kern(
get_rectified_size
(
IH
,
IW
,
OH
,
OW
,
FH
,
FW
,
PH
,
PW
,
IH2
,
IW2
,
OH2
,
OW2
);
bool
rectify_src
=
need_src_copy
(
kern_param
);
size_t
padding_group_size
=
IH2
*
IW2
*
IC
;
const
float
*
sptr
=
static_cast
<
const
float
*>
(
kern_param
.
src_ptr
)
+
ncb_index
.
ndrange_id
[
2
]
*
IH
*
IW
;
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
];
size_t
batch_id
=
ncb_index
.
ndrange_id
[
1
];
size_t
channel_id
=
workspace_ids
[
2
];
const
float
*
sptr
=
static_cast
<
const
float
*>
(
kern_param
.
src
<
float
>
(
batch_id
,
group_id
))
+
channel_id
*
IH
*
IW
;
bundle
.
set
(
kern_param
.
workspace_ptr
);
//! Used for get the workspace offset
size_t
workspace_group_id
=
ncb_index
.
ndrange_id
[
0
],
workspace_batch_id
=
ncb_index
.
ndrange_id
[
1
],
workspace_channel_id
=
ncb_index
.
ndrange_id
[
2
];
size_t
workspace_group_id
=
workspace_ids
[
0
],
workspace_batch_id
=
workspace_ids
[
1
],
workspace_channel_id
=
workspace_ids
[
2
];
if
(
rectify_src
)
{
//! copy to sptr_base to eliminate padding effect
float
*
sptr_base
=
static_cast
<
float
*>
(
bundle
.
get
(
0
))
+
...
...
@@ -397,7 +412,7 @@ void ConvBiasImpl::AlgoDirectStride2::copy_padding_kern(
//! compute one output channel
void
ConvBiasImpl
::
AlgoDirectStride2
::
do_conv_kern
(
WorkspaceBundle
bundle
,
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
)
{
const
NCBKernIndex
&
ncb_index
,
const
CpuNDRange
&
workspace_ids
)
{
size_t
OH
=
kern_param
.
osz
[
0
];
size_t
OW
=
kern_param
.
osz
[
1
];
size_t
IH
=
kern_param
.
isz
[
0
];
...
...
@@ -439,14 +454,17 @@ void ConvBiasImpl::AlgoDirectStride2::do_conv_kern(
megdnn
::
BiasMode
::
BROADCAST_CHANNEL_BIAS
)
{
bias_offset
=
1
_z
;
}
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
];
size_t
batch_id
=
ncb_index
.
ndrange_id
[
1
];
//! Used for get the workspace offset
size_t
workspace_group_id
=
ncb_index
.
ndrange_id
[
0
],
workspace_batch_id
=
ncb_index
.
ndrange_id
[
1
],
oc
=
ncb_index
.
ndrange_id
[
2
];
const
float
*
sptr
=
kern_param
.
src
<
float
>
();
const
float
*
filter
=
kern_param
.
filter
<
float
>
()
+
oc
*
FH
*
FW
*
IC
;
const
float
*
bias_ptr
=
kern_param
.
bias
<
float
>
()
+
oc
*
bias_offset
;
float
*
dst
=
kern_param
.
dst
<
float
>
()
+
oc
*
OH
*
OW
;
size_t
workspace_group_id
=
workspace_ids
[
0
],
workspace_batch_id
=
workspace_ids
[
1
],
oc
=
workspace_ids
[
2
];
const
float
*
sptr
=
kern_param
.
src
<
float
>
(
batch_id
,
group_id
);
const
float
*
filter
=
kern_param
.
filter
<
float
>
(
group_id
)
+
oc
*
FH
*
FW
*
IC
;
const
float
*
bias_ptr
=
kern_param
.
bias
<
float
>
(
batch_id
,
group_id
)
+
oc
*
bias_offset
;
float
*
dst
=
kern_param
.
dst
<
float
>
(
batch_id
,
group_id
)
+
oc
*
OH
*
OW
;
if
(
rectify_src
)
{
sptr
=
static_cast
<
float
*>
(
bundle
.
get
(
0
))
+
workspace_group_id
*
padding_group_size
+
...
...
@@ -547,23 +565,22 @@ MatrixMul* ConvBiasImpl::AlgoMatrixMul::get_matmul_opr() {
}
void
ConvBiasImpl
::
AlgoMatrixMul
::
kimpl
(
const
NCBKernParam
&
param
,
const
NCBKernIndex
&
)
{
const
NCBKernIndex
&
ncb_index
)
{
UNPACK_CONV_F32_NCB_KERN_SIZES
(
param
);
auto
IH2
=
IH
+
2
*
PH
;
auto
IW2
=
IW
+
2
*
PW
;
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
];
bool
is_xcorr
=
!
param
.
filter_meta
.
should_flip
;
auto
bundle
=
get_bundle
(
param
);
bundle
.
set
(
param
.
workspace_ptr
);
// workspace = tmp..src2
for
(
size_t
n
=
0
;
n
<
N
;
++
n
)
{
float
*
src
=
const_cast
<
float
*>
(
param
.
src
<
float
>
())
+
n
*
param
.
inp_bs
;
float
*
dst
=
param
.
dst
<
float
>
()
+
n
*
param
.
out_bs
;
float
*
bias_ptr
=
static_cast
<
float
*>
(
const_cast
<
void
*>
(
param
.
bias_ptr
));
if
(
param
.
bias_mode
==
megdnn
::
BiasMode
::
BIAS
)
{
bias_ptr
+=
n
*
param
.
out_bs
;
}
float
*
src
=
const_cast
<
float
*>
(
param
.
src
<
float
>
(
n
,
group_id
));
float
*
dst
=
param
.
dst
<
float
>
(
n
,
group_id
);
float
*
bias_ptr
=
static_cast
<
float
*>
(
const_cast
<
void
*>
(
param
.
bias
<
void
>
(
n
,
group_id
)));
float
*
B
,
*
src2
;
if
(
FH
==
1
&&
FW
==
1
&&
SH
==
1
&&
SW
==
1
&&
PH
==
0
&&
PW
==
0
)
{
// special case: 1x1
...
...
@@ -613,7 +630,7 @@ void ConvBiasImpl::AlgoMatrixMul::kimpl(const NCBKernParam& param,
{
TensorND
A_
,
B_
,
C_
;
A_
.
layout
=
TensorLayout
({
OC
,
IC
*
FH
*
FW
},
dtype
::
Float32
());
A_
.
raw_ptr
=
const_cast
<
float
*>
(
param
.
filter
<
float
>
());
A_
.
raw_ptr
=
const_cast
<
float
*>
(
param
.
filter
<
float
>
(
group_id
));
B_
.
layout
=
TensorLayout
({
IC
*
FH
*
FW
,
OH
*
OW
},
dtype
::
Float32
());
B_
.
raw_ptr
=
B
;
C_
.
layout
=
TensorLayout
({
OC
,
OH
*
OW
},
dtype
::
Float32
());
...
...
dnn/src/x86/conv_bias/f32/algos.h
浏览文件 @
ea93927d
...
...
@@ -22,10 +22,12 @@ class ConvBiasImpl::AlgoDirect final : public AlgoBase {
static
void
copy_padding_kern
(
WorkspaceBundle
bundle
,
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
);
const
NCBKernIndex
&
ncb_index
,
const
CpuNDRange
&
workspace_ids
);
static
void
do_conv_kern
(
WorkspaceBundle
bundle
,
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
);
const
NCBKernIndex
&
ncb_index
,
const
CpuNDRange
&
workspace_ids
);
bool
m_large_group
;
public:
...
...
@@ -57,10 +59,12 @@ class ConvBiasImpl::AlgoDirectStride2 final : public AlgoBase {
static
void
copy_padding_kern
(
WorkspaceBundle
bundle
,
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
);
const
NCBKernIndex
&
ncb_index
,
const
CpuNDRange
&
workspace_ids
);
static
void
do_conv_kern
(
WorkspaceBundle
bundle
,
const
NCBKernParam
&
kern_param
,
const
NCBKernIndex
&
ncb_index
);
const
NCBKernIndex
&
ncb_index
,
const
CpuNDRange
&
workspace_ids
);
bool
m_large_group
;
public:
...
...
dnn/src/x86/conv_bias/int8/algos.cpp
浏览文件 @
ea93927d
...
...
@@ -146,9 +146,11 @@ WorkspaceBundle ConvBiasImpl::AlgoMkldnnQint8::get_bundle(
} while (0)
void
ConvBiasImpl
::
AlgoMkldnnQint8
::
kern_mkldnn_s8x8x32
(
const
NCBKernParam
&
param
,
const
NCBKernIndex
&
)
{
const
NCBKernParam
&
param
,
const
NCBKernIndex
&
ncb_index
)
{
UNPACK_CONV_F32_NCB_KERN_SIZES
(
param
);
MEGDNN_MARK_USED_VAR
(
N
);
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
];
size_t
batch_id
=
ncb_index
.
ndrange_id
[
1
];
auto
x86_handle
=
static_cast
<
HandleImpl
*>
(
inplace_cpu_handle
().
get
());
megdnn_assert
(
x86_handle
!=
nullptr
,
"x86 handle can not be null"
);
auto
eng_mkldnn
=
x86_handle
->
mkldnn_engine
();
...
...
@@ -167,10 +169,11 @@ void ConvBiasImpl::AlgoMkldnnQint8::kern_mkldnn_s8x8x32(
auto
megdnn_dst_md
=
memory
::
desc
({
dst_shape
},
memory
::
data_type
::
s32
,
memory
::
format_tag
::
nchw
);
auto
megdnn_weight_memory
=
memory
(
megdnn_weight_md
,
eng_mkldnn
,
const_cast
<
void
*>
(
param
.
filter_ptr
));
int8_t
*
src
=
const_cast
<
int8_t
*>
(
param
.
src
<
int8_t
>
());
int32_t
*
dst
=
param
.
dst
<
int32_t
>
();
auto
megdnn_weight_memory
=
memory
(
megdnn_weight_md
,
eng_mkldnn
,
const_cast
<
void
*>
(
param
.
filter
<
void
>
(
group_id
)));
int8_t
*
src
=
const_cast
<
int8_t
*>
(
param
.
src
<
int8_t
>
(
batch_id
,
group_id
));
int32_t
*
dst
=
param
.
dst
<
int32_t
>
(
batch_id
,
group_id
);
auto
megdnn_src_memory
=
memory
(
megdnn_src_md
,
eng_mkldnn
,
static_cast
<
void
*>
(
src
));
...
...
@@ -353,18 +356,18 @@ MatrixMul* ConvBiasImpl::AlgoMkldnnMatmulQint8::get_matmul_opr() {
}
void
ConvBiasImpl
::
AlgoMkldnnMatmulQint8
::
kern_mkldnn_matmul_s8x8x32
(
const
NCBKernParam
&
param
,
const
NCBKernIndex
&
)
{
const
NCBKernParam
&
param
,
const
NCBKernIndex
&
ncb_index
)
{
UNPACK_CONV_F32_NCB_KERN_SIZES
(
param
);
auto
IH2
=
IH
+
2
*
PH
;
auto
IW2
=
IW
+
2
*
PW
;
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
];
bool
is_xcorr
=
!
param
.
filter_meta
.
should_flip
;
auto
bundle
=
get_bundle
(
param
);
bundle
.
set
(
param
.
workspace_ptr
);
for
(
size_t
n
=
0
;
n
<
N
;
++
n
)
{
int8_t
*
src
=
const_cast
<
int8_t
*>
(
param
.
src
<
int8_t
>
())
+
n
*
param
.
inp_bs
;
int32_t
*
dst
=
param
.
dst
<
int32_t
>
()
+
n
*
param
.
out_bs
;
int8_t
*
src
=
const_cast
<
int8_t
*>
(
param
.
src
<
int8_t
>
(
n
,
group_id
));
int32_t
*
dst
=
param
.
dst
<
int32_t
>
(
n
,
group_id
);
int8_t
*
B
,
*
src2
;
if
(
FH
==
1
&&
FW
==
1
&&
SH
==
1
&&
SW
==
1
&&
PH
==
0
&&
PW
==
0
)
{
// special case: 1x1
...
...
@@ -414,7 +417,7 @@ void ConvBiasImpl::AlgoMkldnnMatmulQint8::kern_mkldnn_matmul_s8x8x32(
{
TensorND
A_
,
B_
,
C_
;
A_
.
layout
=
TensorLayout
({
OC
,
IC
*
FH
*
FW
},
dtype
::
Int8
());
A_
.
raw_ptr
=
const_cast
<
int8_t
*>
(
param
.
filter
<
int8_t
>
());
A_
.
raw_ptr
=
const_cast
<
int8_t
*>
(
param
.
filter
<
int8_t
>
(
group_id
));
B_
.
layout
=
TensorLayout
({
IC
*
FH
*
FW
,
OH
*
OW
},
dtype
::
Int8
());
B_
.
raw_ptr
=
B
;
C_
.
layout
=
TensorLayout
({
OC
,
OH
*
OW
},
dtype
::
Int32
());
...
...
dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride1.cpp
浏览文件 @
ea93927d
...
...
@@ -47,8 +47,8 @@ void pack_src_conv_avx2_stride1(WorkspaceBundle bundle,
batch_id
=
ncb_index
.
ndrange_id
[
1
],
channel_id
=
ncb_index
.
ndrange_id
[
2
];
const
int8_t
*
src_ptr
=
kern_param
.
src
<
int8_t
>
()
+
ic_step
*
channel_id
*
c_stride
;
const
int8_t
*
src_ptr
=
kern_param
.
src
<
int8_t
>
(
batch_id
,
group_id
)
+
ic_step
*
channel_id
*
c_stride
;
bundle
.
set
(
kern_param
.
workspace_ptr
);
int8_t
*
packed_src
=
static_cast
<
int8_t
*>
(
bundle
.
get
(
0
))
+
batch_id
*
group
*
packed_group_size
+
...
...
@@ -129,7 +129,7 @@ static inline void pack_filter_conv_avx2_stride1(
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
],
oc_index_id
=
ncb_index
.
ndrange_id
[
1
];
const
int8_t
*
pack_filter_ptr
=
kern_param
.
filter
<
int8_t
>
();
const
int8_t
*
pack_filter_ptr
=
kern_param
.
filter
<
int8_t
>
(
group_id
);
bundle
.
set
(
kern_param
.
workspace_ptr
);
int16_t
*
out_ptr
=
static_cast
<
int16_t
*>
(
bundle
.
get
(
1
))
+
group_id
*
round_up
(
oc
,
oc_step
)
*
oc_out_stride
;
...
...
@@ -632,19 +632,18 @@ void do_conv_kern(WorkspaceBundle bundle,
const
uint32_t
packed_group_size
=
div_ceil
(
ic
,
ic_step
)
*
pack_ih
*
pack_iw
;
size_t
workspace_
group_id
=
ncb_index
.
ndrange_id
[
0
],
workspace_
batch_id
=
ncb_index
.
ndrange_id
[
1
],
workspace_
channel_id
=
ncb_index
.
ndrange_id
[
2
];
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
],
batch_id
=
ncb_index
.
ndrange_id
[
1
],
channel_id
=
ncb_index
.
ndrange_id
[
2
];
bundle
.
set
(
kern_param
.
workspace_ptr
);
int8_t
*
src_ptr
=
static_cast
<
int8_t
*>
(
bundle
.
get
(
0
))
+
workspace_group_id
*
packed_group_size
+
workspace_batch_id
*
group
*
packed_group_size
;
int16_t
*
filter_ptr
=
static_cast
<
int16_t
*>
(
bundle
.
get
(
1
))
+
workspace_group_id
*
round_up
(
oc
,
oc_step
)
*
filter_round_size
+
oc_step
*
workspace_channel_id
*
filter_round_size
;
group_id
*
packed_group_size
+
batch_id
*
group
*
packed_group_size
;
int16_t
*
filter_ptr
=
static_cast
<
int16_t
*>
(
bundle
.
get
(
1
))
+
group_id
*
round_up
(
oc
,
oc_step
)
*
filter_round_size
+
oc_step
*
channel_id
*
filter_round_size
;
bool
need_post_process
=
kern_param
.
dst_type
.
enumv
()
==
DTypeEnum
::
QuantizedS8
;
...
...
@@ -652,12 +651,11 @@ void do_conv_kern(WorkspaceBundle bundle,
int32_t
*
dst_tptr
=
nullptr
;
if
(
need_post_process
)
{
dst_tptr
=
static_cast
<
int32_t
*>
(
bundle
.
get
(
2
))
+
workspace_batch_id
*
group
*
oc
*
oc_stride
+
workspace_group_id
*
oc
*
oc_stride
+
oc_step
*
workspace_channel_id
*
oh
*
ow
;
batch_id
*
group
*
oc
*
oc_stride
+
group_id
*
oc
*
oc_stride
+
oc_step
*
channel_id
*
oh
*
ow
;
}
else
{
dst_tptr
=
kern_param
.
dst
<
int32_t
>
()
+
oc_step
*
workspace_
channel_id
*
oh
*
ow
;
dst_tptr
=
kern_param
.
dst
<
int32_t
>
(
batch_id
,
group_id
)
+
oc_step
*
channel_id
*
oh
*
ow
;
}
const
uint32_t
oc_end
=
oc
/
oc_step
*
oc_step
;
...
...
@@ -666,7 +664,7 @@ void do_conv_kern(WorkspaceBundle bundle,
const
uint32_t
oh_remain
=
oh
-
oh_end
;
const
uint32_t
ow_end
=
ow
/
ow_step
*
ow_step
;
const
uint32_t
ow_remain
=
ow
-
ow_end
;
const
uint32_t
oc_index
=
oc_step
*
workspace_
channel_id
;
const
uint32_t
oc_index
=
oc_step
*
channel_id
;
AlgoAVX2DirectConvStride1S8S8S32_forward
<
oc_step
,
ic_step
,
oh_step
,
ow_step
>
(
...
...
@@ -684,29 +682,29 @@ void do_post_process(WorkspaceBundle bundle,
const
uint32_t
oh
=
kern_param
.
osz
[
0
];
const
uint32_t
ow
=
kern_param
.
osz
[
1
];
size_t
workspace_
group_id
=
ncb_index
.
ndrange_id
[
0
],
workspace_
batch_id
=
ncb_index
.
ndrange_id
[
1
];
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
],
batch_id
=
ncb_index
.
ndrange_id
[
1
];
bundle
.
set
(
kern_param
.
workspace_ptr
);
bool
need_post_process
=
kern_param
.
dst_type
.
enumv
()
==
DTypeEnum
::
QuantizedS8
;
void
*
dst_tptr
=
nullptr
;
if
(
need_post_process
)
{
dst_tptr
=
static_cast
<
int32_t
*>
(
bundle
.
get
(
2
))
+
workspace_batch_id
*
group
*
oc
*
oh
*
ow
+
workspace_group_id
*
oc
*
oh
*
ow
;
batch_id
*
group
*
oc
*
oh
*
ow
+
group_id
*
oc
*
oh
*
ow
;
}
else
{
dst_tptr
=
kern_param
.
dst
<
dt_int32
>
();
dst_tptr
=
kern_param
.
dst
<
dt_int32
>
(
batch_id
,
group_id
);
}
void
*
dst_ptr
=
kern_param
.
dst
<
void
>
(
batch_id
,
group_id
);
#define cb(_bias_ctype, _dst_ctype, _postprocess_mode) \
{ \
const dt_int32* bias_ptr = kern_param.bias<dt_int32>(); \
const dt_int32* bias_ptr = \
kern_param.bias<dt_int32>(batch_id, group_id); \
PostProcess<DTypeTrait<_bias_ctype>::ctype, \
DTypeTrait<_dst_ctype>::ctype, \
_postprocess_mode>::run(dst_tptr, \
const_cast<dt_int32*>(bias_ptr), \
kern_param.dst_ptr, \
kern_param.bias_mode, \
dst_ptr, kern_param.bias_mode, \
kern_param.nonlineMode, \
kern_param.bias_type, \
kern_param.dst_type, 1, oc, oh, \
...
...
dnn/src/x86/conv_bias/int8/avx2_direct_conv_stride2.cpp
浏览文件 @
ea93927d
...
...
@@ -45,8 +45,8 @@ void pack_src_conv_avx2_stride2(WorkspaceBundle bundle,
batch_id
=
ncb_index
.
ndrange_id
[
1
],
channel_id
=
ncb_index
.
ndrange_id
[
2
];
const
int8_t
*
src_ptr
=
kern_param
.
src
<
int8_t
>
()
+
ic_step
*
channel_id
*
c_stride
;
const
int8_t
*
src_ptr
=
kern_param
.
src
<
int8_t
>
(
batch_id
,
group_id
)
+
ic_step
*
channel_id
*
c_stride
;
bundle
.
set
(
kern_param
.
workspace_ptr
);
int8_t
*
packed_src
=
static_cast
<
int8_t
*>
(
bundle
.
get
(
0
))
+
batch_id
*
group
*
packed_group_size
+
...
...
@@ -187,7 +187,7 @@ static inline void pack_filter_conv_avx2_stride2(
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
],
oc_index_id
=
ncb_index
.
ndrange_id
[
1
];
const
int8_t
*
pack_filter_ptr
=
kern_param
.
filter
<
int8_t
>
();
const
int8_t
*
pack_filter_ptr
=
kern_param
.
filter
<
int8_t
>
(
group_id
);
bundle
.
set
(
kern_param
.
workspace_ptr
);
int16_t
*
out_ptr
=
static_cast
<
int16_t
*>
(
bundle
.
get
(
1
))
+
group_id
*
round_up
(
oc
,
oc_step
)
*
oc_out_stride
;
...
...
@@ -705,18 +705,17 @@ void kernel_imp(WorkspaceBundle bundle,
const
uint32_t
packed_group_size
=
div_ceil
(
ic
,
ic_step
)
*
pack_ih
*
pack_iw
;
size_t
workspace_
group_id
=
ncb_index
.
ndrange_id
[
0
],
workspace_
batch_id
=
ncb_index
.
ndrange_id
[
1
],
workspace_
channel_id
=
ncb_index
.
ndrange_id
[
2
];
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
],
batch_id
=
ncb_index
.
ndrange_id
[
1
],
channel_id
=
ncb_index
.
ndrange_id
[
2
];
bundle
.
set
(
kern_param
.
workspace_ptr
);
int8_t
*
src_ptr
=
static_cast
<
int8_t
*>
(
bundle
.
get
(
0
))
+
workspace_group_id
*
packed_group_size
+
workspace_batch_id
*
group
*
packed_group_size
;
int16_t
*
filter_ptr
=
static_cast
<
int16_t
*>
(
bundle
.
get
(
1
))
+
workspace_group_id
*
round_up
(
oc
,
oc_step
)
*
filter_round_size
+
oc_step
*
workspace_channel_id
*
filter_round_size
;
group_id
*
packed_group_size
+
batch_id
*
group
*
packed_group_size
;
int16_t
*
filter_ptr
=
static_cast
<
int16_t
*>
(
bundle
.
get
(
1
))
+
group_id
*
round_up
(
oc
,
oc_step
)
*
filter_round_size
+
oc_step
*
channel_id
*
filter_round_size
;
bool
need_post_process
=
kern_param
.
dst_type
.
enumv
()
==
DTypeEnum
::
QuantizedS8
;
...
...
@@ -724,12 +723,11 @@ void kernel_imp(WorkspaceBundle bundle,
int32_t
*
dst_tptr
=
nullptr
;
if
(
need_post_process
)
{
dst_tptr
=
static_cast
<
int32_t
*>
(
bundle
.
get
(
2
))
+
workspace_batch_id
*
group
*
oc
*
oc_stride
+
workspace_group_id
*
oc
*
oc_stride
+
oc_step
*
workspace_channel_id
*
oh
*
ow
;
batch_id
*
group
*
oc
*
oc_stride
+
group_id
*
oc
*
oc_stride
+
oc_step
*
channel_id
*
oh
*
ow
;
}
else
{
dst_tptr
=
kern_param
.
dst
<
int32_t
>
()
+
oc_step
*
workspace_
channel_id
*
oh
*
ow
;
dst_tptr
=
kern_param
.
dst
<
int32_t
>
(
batch_id
,
group_id
)
+
oc_step
*
channel_id
*
oh
*
ow
;
}
const
uint32_t
oc_end
=
oc
/
oc_step
*
oc_step
;
const
uint32_t
oc_remain
=
oc
-
oc_end
;
...
...
@@ -737,7 +735,7 @@ void kernel_imp(WorkspaceBundle bundle,
const
uint32_t
oh_remain
=
oh
-
oh_end
;
const
uint32_t
ow_end
=
ow
/
ow_step
*
ow_step
;
const
uint32_t
ow_remain
=
ow
-
ow_end
;
const
uint32_t
oc_index
=
oc_step
*
workspace_
channel_id
;
const
uint32_t
oc_index
=
oc_step
*
channel_id
;
kernel_handle_oh_remain
<
oc_step
,
ic_step
,
oh_step
,
ow_step
>
(
oh_remain
,
oc_remain
,
ow_remain
,
filter_ptr
,
src_ptr
,
dst_tptr
,
...
...
@@ -754,8 +752,8 @@ void do_post_process(WorkspaceBundle bundle,
const
uint32_t
oh
=
kern_param
.
osz
[
0
];
const
uint32_t
ow
=
kern_param
.
osz
[
1
];
size_t
workspace_
group_id
=
ncb_index
.
ndrange_id
[
0
],
workspace_
batch_id
=
ncb_index
.
ndrange_id
[
1
];
size_t
group_id
=
ncb_index
.
ndrange_id
[
0
],
batch_id
=
ncb_index
.
ndrange_id
[
1
];
bundle
.
set
(
kern_param
.
workspace_ptr
);
bool
need_post_process
=
...
...
@@ -763,21 +761,22 @@ void do_post_process(WorkspaceBundle bundle,
void
*
dst_tptr
=
nullptr
;
if
(
need_post_process
)
{
dst_tptr
=
static_cast
<
int32_t
*>
(
bundle
.
get
(
2
))
+
workspace_
batch_id
*
group
*
oc
*
oh
*
ow
+
workspace_
group_id
*
oc
*
oh
*
ow
;
batch_id
*
group
*
oc
*
oh
*
ow
+
group_id
*
oc
*
oh
*
ow
;
}
else
{
dst_tptr
=
kern_param
.
dst
<
dt_int32
>
();
dst_tptr
=
kern_param
.
dst
<
dt_int32
>
(
batch_id
,
group_id
);
}
void
*
dst_ptr
=
kern_param
.
dst
<
void
>
(
batch_id
,
group_id
);
#define cb(_bias_ctype, _dst_ctype, _postprocess_mode) \
{ \
const dt_int32* bias_ptr = kern_param.bias<dt_int32>(); \
const dt_int32* bias_ptr = \
kern_param.bias<dt_int32>(batch_id, group_id); \
PostProcess<DTypeTrait<_bias_ctype>::ctype, \
DTypeTrait<_dst_ctype>::ctype, \
_postprocess_mode>::run(dst_tptr, \
const_cast<dt_int32*>(bias_ptr), \
kern_param.dst_ptr, \
kern_param.bias_mode, \
dst_ptr, kern_param.bias_mode, \
kern_param.nonlineMode, \
kern_param.bias_type, \
kern_param.dst_type, 1, oc, oh, \
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录