wjd2002 / ncnn
Compare revisions: e8645e9117cd926530c405b103b7afb984c7173b...55709708e998f21962763a40db1099630ef36458
Commits (5)

a9a7be0e  c_api: expose Mat border processing api (#4855)
          Mek101 <mek101-dev.inv@slmail.me>, 2023-07-15T19:13:18+08:00
          https://gitcode.net/wjd2002/ncnn/-/commit/a9a7be0e0a1938d6fca8c51c410f5867fdefb297

411a098d  Expose layer_to_index in c-api (#4860)
          Mek101 <mek101-dev.inv@slmail.me>, 2023-07-16T22:08:01+08:00
          https://gitcode.net/wjd2002/ncnn/-/commit/411a098d5e009a1ef5ab2791b63b6f92ddd9a05c

9f29a173  c_api return null on null layer (#4865)
          Mek101 <mek101-dev.inv@slmail.me>, 2023-07-18T13:03:29+08:00
          https://gitcode.net/wjd2002/ncnn/-/commit/9f29a1737c07ad797c9ecf62140bdc966cd18a5f

2303b77a  Update how-to-build.md (#4872)
          ฅ'ω'ฅ <1152383857@qq.com>, 2023-07-21T19:35:36+08:00
          https://gitcode.net/wjd2002/ncnn/-/commit/2303b77ac17ac880860252152f26f8d058abc1ee

55709708  x86 optimization for convolution int8 packed unified elempack (#4861)
          nihui <nihuini@tencent.com>, 2023-07-22T22:01:37+08:00
          https://gitcode.net/wjd2002/ncnn/-/commit/55709708e998f21962763a40db1099630ef36458
Showing 21 changed files with 5647 additions and 463 deletions (+5647, -463)
.github/workflows/linux-aarch64-cpu-gcc.yml       +2     -2
docs/how-to-build/how-to-build.md                 +4     -2
src/c_api.cpp                                     +43    -2
src/c_api.h                                       +11    -0
src/layer/arm/convolution_3x3_winograd.h          +6     -6
src/layer/arm/convolution_3x3_winograd_bf16s.h    +6     -6
src/layer/x86/convolution_int8.h                  +0     -82
src/layer/x86/convolution_pack1to4_int8.h         +0     -89
src/layer/x86/convolution_pack8to1_int8.h         +0     -96
src/layer/x86/convolution_pack8to4_int8.h         +0     -130
src/layer/x86/convolution_packed_int8.h           +5386  -0
src/layer/x86/convolution_x86.cpp                 +10    -48
src/layer/x86/convolution_x86_avx2.cpp            +12    -0
src/layer/x86/convolution_x86_avx512vnni.cpp      +7     -0
src/layer/x86/convolution_x86_avxvnni.cpp         +7     -0
src/layer/x86/convolution_x86_xop.cpp             +7     -0
src/layer/x86/x86_usability.h                     +54    -0
tests/test_convolution.cpp                        +4     -0
tests/test_convolution_1.cpp                      +4     -0
tests/test_convolution_2.cpp                      +4     -0
tests/test_convolution_3.cpp                      +80    -0
.github/workflows/linux-aarch64-cpu-gcc.yml  (view file @ 55709708)

@@ -155,7 +155,7 @@ jobs:
         uses: actions/cache@v3
         with:
           path: qemu-install
-          key: qemu-aarch64-install-20220502-2
+          key: qemu-aarch64-install-20230717
     - name: install-qemu-build-deps
       if: steps.cache-qemu.outputs.cache-hit != 'true'
       run: |
@@ -167,7 +167,7 @@ jobs:
         with:
           repository: qemu/qemu
           path: qemu
-          ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65
+          ref: ed8ad9728a9c0eec34db9dff61dfa2f1dd625637
     - name: qemu
       if: steps.cache-qemu.outputs.cache-hit != 'true'
       run: |
docs/how-to-build/how-to-build.md  (view file @ 55709708)

@@ -147,14 +147,16 @@ Download and Install Visual Studio Community 2017 from https://visualstudio.micr
 Start the command prompt: `Start → Programs → Visual Studio 2017 → Visual Studio Tools → x64 Native Tools Command Prompt for VS 2017`
+> You can also search `x64 Native Tools Command Prompt for VS 2017` directly.

 Download protobuf-3.11.2 from https://github.com/google/protobuf/archive/v3.11.2.zip

 Build protobuf library:

 ```shell
 cd <protobuf-root-dir>
-mkdir build
-cd build
+mkdir protobuf_build
+cd protobuf_build
 cmake -A x64 -DCMAKE_INSTALL_PREFIX=%cd%/install -Dprotobuf_BUILD_TESTS=OFF -Dprotobuf_MSVC_STATIC_RUNTIME=OFF ../cmake
 cmake --build . --config Release -j 2
 cmake --build . --config Release --target install
 ```
src/c_api.cpp  (view file @ 55709708)

@@ -1028,8 +1028,14 @@ ncnn_layer_t ncnn_layer_create()
 ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex)
 {
+    void* pthis = (void*)(ncnn::create_layer(typeindex));
+    if (!pthis)
+    {
+        return 0;
+    }
+
     ncnn_layer_t layer = (ncnn_layer_t)malloc(sizeof(__ncnn_layer_t));
-    layer->pthis = (void*)(ncnn::create_layer(typeindex));
+    layer->pthis = pthis;
     layer->load_param = __ncnn_layer_load_param;
     layer->load_model = __ncnn_layer_load_model;
     layer->create_pipeline = __ncnn_layer_create_pipeline;
@@ -1044,8 +1050,14 @@ ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex)
 #if NCNN_STRING
 ncnn_layer_t ncnn_layer_create_by_type(const char* type)
 {
+    void* pthis = (void*)(ncnn::create_layer(type));
+    if (!pthis)
+    {
+        return 0;
+    }
+
     ncnn_layer_t layer = (ncnn_layer_t)malloc(sizeof(__ncnn_layer_t));
-    layer->pthis = (void*)(ncnn::create_layer(type));
+    layer->pthis = pthis;
     layer->load_param = __ncnn_layer_load_param;
     layer->load_model = __ncnn_layer_load_model;
     layer->create_pipeline = __ncnn_layer_create_pipeline;
@@ -1056,6 +1068,11 @@ ncnn_layer_t ncnn_layer_create_by_type(const char* type)
     layer->forward_inplace_n = __ncnn_layer_forward_inplace_n;
     return layer;
 }
+
+int ncnn_layer_type_to_index(const char* type)
+{
+    return ncnn::layer_to_index(type);
+}
 #endif /* NCNN_STRING */

 void ncnn_layer_destroy(ncnn_layer_t layer)
@@ -1417,6 +1434,30 @@ int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat
     return ret;
 }

+void ncnn_copy_make_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int type, float v, const ncnn_option_t opt)
+{
+    const Option _opt = opt ? *((const Option*)opt) : Option();
+    copy_make_border(*(const Mat*)src, *(Mat*)dst, top, bottom, left, right, type, v, _opt);
+}
+
+void ncnn_copy_make_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const ncnn_option_t opt)
+{
+    const Option _opt = opt ? *((const Option*)opt) : Option();
+    copy_make_border_3d(*(const Mat*)src, *(Mat*)dst, top, bottom, left, right, front, behind, type, v, _opt);
+}
+
+void ncnn_copy_cut_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, const ncnn_option_t opt)
+{
+    const Option _opt = opt ? *((const Option*)opt) : Option();
+    copy_cut_border(*(const Mat*)src, *(Mat*)dst, top, bottom, left, right, _opt);
+}
+
+void ncnn_copy_cut_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, const ncnn_option_t opt)
+{
+    const Option _opt = opt ? *((const Option*)opt) : Option();
+    copy_cut_border_3d(*(const Mat*)src, *(Mat*)dst, top, bottom, left, right, front, behind, _opt);
+}
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
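Taken together with #4860 and #4865, layer creation through the C API now fails cleanly: ncnn_layer_create_by_type / ncnn_layer_create_by_typeindex return 0 when ncnn::create_layer cannot produce the layer, and ncnn_layer_type_to_index exposes the string-to-index lookup. A minimal usage sketch of the changed entry points; the "Convolution" type string is only an example, and the negative-return convention for an unknown type is assumed from ncnn::layer_to_index:

```cpp
#include <stdio.h>
#include "c_api.h"

int create_conv_layer_example(void)
{
    /* new in this diff: resolve a layer type string to its registered index */
    int typeindex = ncnn_layer_type_to_index("Convolution");
    if (typeindex < 0) /* assumed: lookup failure reports a negative index */
    {
        fprintf(stderr, "unknown layer type\n");
        return -1;
    }

    /* now returns 0 (NULL) instead of a half-initialized handle on failure */
    ncnn_layer_t layer = ncnn_layer_create_by_typeindex(typeindex);
    if (!layer)
    {
        fprintf(stderr, "layer creation failed\n");
        return -1;
    }

    ncnn_layer_destroy(layer);
    return 0;
}
```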
src/c_api.h  (view file @ 55709708)

@@ -210,6 +210,7 @@ NCNN_EXPORT ncnn_layer_t ncnn_layer_create();
 NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_typeindex(int typeindex);
 #if NCNN_STRING
 NCNN_EXPORT ncnn_layer_t ncnn_layer_create_by_type(const char* type);
+NCNN_EXPORT int ncnn_layer_type_to_index(const char* type);
 #endif /* NCNN_STRING */
 NCNN_EXPORT void ncnn_layer_destroy(ncnn_layer_t layer);
@@ -327,6 +328,16 @@ NCNN_EXPORT int ncnn_extractor_extract(ncnn_extractor_t ex, const char* name, nc
 NCNN_EXPORT int ncnn_extractor_input_index(ncnn_extractor_t ex, int index, const ncnn_mat_t mat);
 NCNN_EXPORT int ncnn_extractor_extract_index(ncnn_extractor_t ex, int index, ncnn_mat_t* mat);

+/* mat process api */
+#define NCNN_BORDER_CONSTANT 0
+#define NCNN_BORDER_REPLICATE 1
+#define NCNN_BORDER_REFLECT 2
+#define NCNN_BORDER_TRANSPARENT -233
+NCNN_EXPORT void ncnn_copy_make_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int type, float v, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_make_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, int type, float v, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_cut_border(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, const ncnn_option_t opt);
+NCNN_EXPORT void ncnn_copy_cut_border_3d(const ncnn_mat_t src, ncnn_mat_t dst, int top, int bottom, int left, int right, int front, int behind, const ncnn_option_t opt);
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
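The new declarations are the C-side mirror of ncnn's C++ copy_make_border / copy_cut_border helpers. A minimal sketch of padding a Mat through the new C API; it assumes, as in the C++ helpers, that the padded output is allocated into the destination handle, so an empty handle from ncnn_mat_create() serves as the out-parameter:

```cpp
#include "c_api.h"

/* pad src by 2 pixels on every side with constant zeros (illustrative use of the new API) */
static ncnn_mat_t pad2(const ncnn_mat_t src)
{
    ncnn_option_t opt = ncnn_option_create();
    ncnn_mat_t dst = ncnn_mat_create(); /* assumed to be (re)allocated by ncnn_copy_make_border */

    ncnn_copy_make_border(src, dst, 2, 2, 2, 2, NCNN_BORDER_CONSTANT, 0.f, opt);

    ncnn_option_destroy(opt);
    return dst; /* caller releases with ncnn_mat_destroy */
}
```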
src/layer/arm/convolution_3x3_winograd.h  (view file @ 55709708)

@@ -6302,9 +6302,9 @@ static inline void conv3x3s1_winograd43_transform_input_tile(const Mat& bottom_b
                     float32x4x2_t _t01 = vzipq_f32(_t0, _t1);

                     _r0 = vget_low_f32(_t01.val[0]);
-                    if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
-                    if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
-                    if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
+                    if (tj * 4 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
+                    if (tj * 4 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
+                    if (tj * 4 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
                     if (tj * 4 + 4 < w)
                     {
                         float tmp[2] = {r0[4], r1[4]};
@@ -8081,9 +8081,9 @@ static inline void conv3x3s1_winograd63_transform_input_tile(const Mat& bottom_b
                     float32x4x2_t _t01 = vzipq_f32(_t0, _t1);

                     _r0 = vget_low_f32(_t01.val[0]);
-                    if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
-                    if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
-                    if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
+                    if (tj * 6 + 1 < w) _r1 = vget_high_f32(_t01.val[0]);
+                    if (tj * 6 + 2 < w) _r2 = vget_low_f32(_t01.val[1]);
+                    if (tj * 6 + 3 < w) _r3 = vget_high_f32(_t01.val[1]);
                     if (tj * 6 + 4 < w)
                     {
                         _t0 = vld1q_f32(r0 + 4);
src/layer/arm/convolution_3x3_winograd_bf16s.h  (view file @ 55709708)

@@ -1540,9 +1540,9 @@ static inline void conv3x3s1_winograd43_transform_input_tile_bf16s(const Mat& bo
                     float32x4_t _t1_fp32 = bfloat2float(_t01.val[1]);

                     _r0 = vget_low_f32(_t0_fp32);
-                    if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
-                    if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
-                    if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
+                    if (tj * 4 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
+                    if (tj * 4 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
+                    if (tj * 4 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
                     if (tj * 4 + 4 < w)
                     {
                         float tmp[2] = {bfloat16_to_float32(r0[4]), bfloat16_to_float32(r1[4])};
@@ -3211,9 +3211,9 @@ static inline void conv3x3s1_winograd63_transform_input_tile_bf16s(const Mat& bo
                     float32x4_t _t1_fp32 = bfloat2float(_t01.val[1]);

                     _r0 = vget_low_f32(_t0_fp32);
-                    if (tj * 2 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
-                    if (tj * 2 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
-                    if (tj * 2 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
+                    if (tj * 6 + 1 < w) _r1 = vget_high_f32(_t0_fp32);
+                    if (tj * 6 + 2 < w) _r2 = vget_low_f32(_t1_fp32);
+                    if (tj * 6 + 3 < w) _r3 = vget_high_f32(_t1_fp32);
                     if (tj * 6 + 4 < w)
                     {
                         _t0 = vld1_u16(r0 + 4);
src/layer/x86/convolution_int8.h  deleted (100644 → 0, view file @ e8645e91)

// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void convolution_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                int sum = 0;

                // const signed char* kptr = weight_data_int8.channel(p);
                const signed char* kptr = (const signed char*)weight_data_int8 + maxk * channels * p;

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        signed char val = sptr[space_ofs[k]];
                        signed char w = kptr[k];
                        sum += val * w;
                    }

                    kptr += maxk;
                }

                outptr[j] = sum;
            }

            outptr += outw;
        }
    }
}
src/layer/x86/convolution_pack1to4_int8.h  deleted (100644 → 0, view file @ e8645e91)

// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void convolution_pack1to4_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                __m128i _sum0 = _mm_setzero_si128();

                const signed char* kptr = weight_data_int8.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        __m128i _val = _mm_set1_epi16((short)sptr[space_ofs[k]]);

                        // TODO use _mm_cvtepi8_epi16 on sse4.1
                        __m128i _w = _mm_loadl_epi64((const __m128i*)kptr);
                        _w = _mm_unpacklo_epi8(_w, _mm_cmpgt_epi8(_mm_setzero_si128(), _w));

                        __m128i _sl = _mm_mullo_epi16(_val, _w);
                        __m128i _sh = _mm_mulhi_epi16(_val, _w);
                        __m128i _s0 = _mm_unpacklo_epi16(_sl, _sh);

                        _sum0 = _mm_add_epi32(_sum0, _s0);

                        kptr += 4;
                    }
                }

                _mm_storeu_si128((__m128i*)(outptr + j * 4), _sum0);
            }

            outptr += outw * 4;
        }
    }
}
src/layer/x86/convolution_pack8to1_int8.h  deleted (100644 → 0, view file @ e8645e91)

// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void convolution_pack8to1_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                int sum = 0;

                const signed char* kptr = weight_data_int8.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w * 8;

                    for (int k = 0; k < maxk; k++)
                    {
                        // TODO use _mm_cvtepi8_epi16 on sse4.1
                        __m128i _val = _mm_loadl_epi64((const __m128i*)(sptr + space_ofs[k] * 8));
                        _val = _mm_unpacklo_epi8(_val, _mm_cmpgt_epi8(_mm_setzero_si128(), _val));

                        __m128i _w = _mm_loadl_epi64((const __m128i*)kptr);
                        _w = _mm_unpacklo_epi8(_w, _mm_cmpgt_epi8(_mm_setzero_si128(), _w));

                        __m128i _sl = _mm_mullo_epi16(_val, _w);
                        __m128i _sh = _mm_mulhi_epi16(_val, _w);
                        __m128i _s0 = _mm_unpacklo_epi16(_sl, _sh);
                        __m128i _s1 = _mm_unpackhi_epi16(_sl, _sh);
                        __m128i _s4 = _mm_add_epi32(_s0, _s1);

                        // TODO use _mm_hadd_epi32 on ssse3
                        int s4[4];
                        _mm_storeu_si128((__m128i*)s4, _s4);
                        sum += s4[0] + s4[1] + s4[2] + s4[3]; // dot

                        kptr += 8;
                    }
                }

                outptr[j] = sum;
            }

            outptr += outw;
        }
    }
}
src/layer/x86/convolution_pack8to4_int8.h  deleted (100644 → 0, view file @ e8645e91)

// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void convolution_pack8to4_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
{
    int w = bottom_blob.w;
    int channels = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        int* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                __m128i _sum0 = _mm_setzero_si128();
                __m128i _sum1 = _mm_setzero_si128();
                __m128i _sum2 = _mm_setzero_si128();
                __m128i _sum3 = _mm_setzero_si128();

                const signed char* kptr = weight_data_int8.channel(p);

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob.channel(q);
                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w * 8;

                    for (int k = 0; k < maxk; k++)
                    {
                        // TODO use _mm_cvtepi8_epi16 on sse4.1
                        __m128i _val = _mm_loadl_epi64((const __m128i*)(sptr + space_ofs[k] * 8));
                        _val = _mm_unpacklo_epi8(_val, _mm_cmpgt_epi8(_mm_setzero_si128(), _val));

                        // TODO use _mm_cvtepi8_epi16 on sse4.1
                        __m128i _w01 = _mm_loadu_si128((const __m128i*)kptr);
                        __m128i _w23 = _mm_loadu_si128((const __m128i*)(kptr + 16));
                        __m128i _extw01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w01);
                        __m128i _extw23 = _mm_cmpgt_epi8(_mm_setzero_si128(), _w23);
                        __m128i _w0 = _mm_unpacklo_epi8(_w01, _extw01);
                        __m128i _w1 = _mm_unpackhi_epi8(_w01, _extw01);
                        __m128i _w2 = _mm_unpacklo_epi8(_w23, _extw23);
                        __m128i _w3 = _mm_unpackhi_epi8(_w23, _extw23);

                        __m128i _sl0 = _mm_mullo_epi16(_val, _w0);
                        __m128i _sh0 = _mm_mulhi_epi16(_val, _w0);
                        __m128i _sl1 = _mm_mullo_epi16(_val, _w1);
                        __m128i _sh1 = _mm_mulhi_epi16(_val, _w1);
                        __m128i _sl2 = _mm_mullo_epi16(_val, _w2);
                        __m128i _sh2 = _mm_mulhi_epi16(_val, _w2);
                        __m128i _sl3 = _mm_mullo_epi16(_val, _w3);
                        __m128i _sh3 = _mm_mulhi_epi16(_val, _w3);

                        _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0));
                        _sum1 = _mm_add_epi32(_sum1, _mm_unpacklo_epi16(_sl1, _sh1));
                        _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl2, _sh2));
                        _sum3 = _mm_add_epi32(_sum3, _mm_unpacklo_epi16(_sl3, _sh3));
                        _sum0 = _mm_add_epi32(_sum0, _mm_unpackhi_epi16(_sl0, _sh0));
                        _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl1, _sh1));
                        _sum2 = _mm_add_epi32(_sum2, _mm_unpackhi_epi16(_sl2, _sh2));
                        _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl3, _sh3));

                        kptr += 32;
                    }
                }

                // transpose 4x4
                {
                    __m128i _tmp0, _tmp1, _tmp2, _tmp3;
                    _tmp0 = _mm_unpacklo_epi32(_sum0, _sum1);
                    _tmp1 = _mm_unpacklo_epi32(_sum2, _sum3);
                    _tmp2 = _mm_unpackhi_epi32(_sum0, _sum1);
                    _tmp3 = _mm_unpackhi_epi32(_sum2, _sum3);
                    _sum0 = _mm_unpacklo_epi64(_tmp0, _tmp1);
                    _sum1 = _mm_unpackhi_epi64(_tmp0, _tmp1);
                    _sum2 = _mm_unpacklo_epi64(_tmp2, _tmp3);
                    _sum3 = _mm_unpackhi_epi64(_tmp2, _tmp3);
                }

                _sum0 = _mm_add_epi32(_sum0, _sum1);
                _sum2 = _mm_add_epi32(_sum2, _sum3);
                _sum0 = _mm_add_epi32(_sum0, _sum2);

                _mm_storeu_si128((__m128i*)(outptr + j * 4), _sum0);
            }

            outptr += outw * 4;
        }
    }
}
src/layer/x86/convolution_packed_int8.h  new file (0 → 100644, view file @ 55709708)

This diff is collapsed (+5386 lines not shown).
src/layer/x86/convolution_x86.cpp  (view file @ 55709708)

@@ -45,16 +45,14 @@ namespace ncnn {
 #include "convolution_sgemm_int8.h"
 #include "convolution_1x1_int8.h"
 #include "convolution_3x3_int8.h"
-#include "convolution_int8.h"
+#include "convolution_packed_int8.h"
 #endif // NCNN_INT8

 #if __SSE2__
 #include "convolution_3x3_pack1to4.h"

 #if NCNN_INT8
-#include "convolution_pack8to4_int8.h"
-#include "convolution_pack1to4_int8.h"
-#include "convolution_pack8to1_int8.h"
 #include "convolution_sgemm_pack8to4_int8.h"
 #include "convolution_sgemm_pack1to4_int8.h"
 #include "convolution_sgemm_pack8to1_int8.h"
@@ -1237,42 +1235,6 @@ int Convolution_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<M
 }

 #if NCNN_INT8
-static void convolution_transform_kernel_packed_int8_sse(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack)
-{
-    const int maxk = kernel_w * kernel_h;
-
-    // src = kw-kh-inch-outch
-    // dst = pa-pb-kw-kh-inch/pa-outch/pb
-    {
-        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
-
-        weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)elempack * out_elempack, elempack * out_elempack);
-
-        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
-        {
-            signed char* g00 = weight_data_tm.channel(q / out_elempack);
-
-            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
-            {
-                for (int k = 0; k < maxk; k++)
-                {
-                    for (int i = 0; i < out_elempack; i++)
-                    {
-                        for (int j = 0; j < elempack; j++)
-                        {
-                            const signed char* k00 = weight_data_r2.channel(q + i).row<const signed char>(p + j);
-                            g00[0] = k00[k];
-                            g00++;
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
 int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
 {
     const int maxk = kernel_w * kernel_h;
@@ -1309,7 +1271,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
     }
     else
     {
-        convolution_transform_kernel_packed_int8_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
+        convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
     }
 }
@@ -1341,7 +1303,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
     }
     else
     {
-        convolution_transform_kernel_packed_int8_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
+        convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
     }
 }
@@ -1365,7 +1327,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
     }
     else
    {
-        convolution_transform_kernel_packed_int8_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack);
+        convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
     }
 }
 #endif // __SSE2__
@@ -1391,7 +1353,7 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
     }
     else
     {
-        weight_data_tm = weight_data;
+        convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
     }
 }
@@ -1501,7 +1463,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
     }
     else
     {
-        convolution_pack8to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+        convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
     }
 }
@@ -1533,7 +1495,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
     }
     else
     {
-        convolution_pack1to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+        convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
     }
 }
@@ -1557,7 +1519,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
     }
     else
     {
-        convolution_pack8to1_int8_sse(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+        convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
     }
 }
 #endif // __SSE2__
@@ -1583,7 +1545,7 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con
     }
     else
     {
-        convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+        convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
     }
 }
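The structural change here is that the three pack-specific int8 kernels and the local repack helper are replaced by a single elempack-agnostic pair, convolution_transform_kernel_packed_int8 / convolution_packed_int8, from the new (collapsed) header. As rough orientation only, below is a scalar sketch of how a kernel can consume the unified weight layout described in the deleted repack helper's comment (pa-pb-kw-kh-inch/pa-outch/pb); the function name, the plain CHW input layout, and the omission of dilation are all assumptions for readability, not ncnn's implementation:

```cpp
#include <vector>

// Illustrative scalar reference: weights are assumed repacked as
// [outch/out_elempack][inch/elempack][maxk][out_elempack][elempack],
// matching the repack loop this PR removed from convolution_x86.cpp.
static void convolution_packed_int8_ref(const signed char* bottom, int w, int h, int inch,
                                        int* top, int outw, int outh, int outch,
                                        const signed char* weights,
                                        int kernel_w, int kernel_h,
                                        int stride_w, int stride_h,
                                        int elempack, int out_elempack)
{
    const int maxk = kernel_w * kernel_h;

    for (int op = 0; op < outch; op += out_elempack)
    {
        // one packed weight block per group of out_elempack output channels
        const signed char* kptr0 = weights + (op / out_elempack) * (inch / elempack) * maxk * out_elempack * elempack;

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                std::vector<int> sum(out_elempack, 0);
                const signed char* kptr = kptr0;

                for (int q = 0; q < inch; q += elempack)
                {
                    for (int k = 0; k < maxk; k++)
                    {
                        const int ky = k / kernel_w;
                        const int kx = k % kernel_w;

                        // per tap, weights are stored out_elempack-major, elempack-minor
                        for (int oi = 0; oi < out_elempack; oi++)
                        {
                            for (int pj = 0; pj < elempack; pj++)
                            {
                                const signed char val = bottom[(q + pj) * w * h + (i * stride_h + ky) * w + (j * stride_w + kx)];
                                sum[oi] += val * *kptr++;
                            }
                        }
                    }
                }

                for (int oi = 0; oi < out_elempack; oi++)
                    top[(op + oi) * outw * outh + i * outw + j] = sum[oi];
            }
        }
    }
}
```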
src/layer/x86/convolution_x86_avx2.cpp  (view file @ 55709708)

@@ -18,6 +18,7 @@
 namespace ncnn {

+#include "convolution_packed_int8.h"
 #include "convolution_sgemm_int8.h"
 #include "convolution_sgemm_pack1to4_int8.h"
 #include "convolution_sgemm_pack8to1_int8.h"
@@ -25,6 +26,17 @@ namespace ncnn {
 #include "convolution_3x3_pack8to1_int8.h"
 #include "convolution_3x3_pack8to4_int8.h"

+// packed
+void convolution_transform_kernel_packed_int8_avx2(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
+{
+    convolution_transform_kernel_packed_int8(kernel, kernel_tm, inch, outch, kernel_w, kernel_h);
+}
+
+void convolution_packed_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+}
+
 // pack1
 void im2col_sgemm_int8_sse_avx2(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
src/layer/x86/convolution_x86_avx512vnni.cpp  (view file @ 55709708)

@@ -18,6 +18,7 @@
 namespace ncnn {

+#include "convolution_packed_int8.h"
 #include "convolution_sgemm_int8.h"
 #include "convolution_sgemm_pack1to4_int8.h"
 #include "convolution_sgemm_pack8to1_int8.h"
@@ -25,6 +26,12 @@ namespace ncnn {
 #include "convolution_3x3_pack8to1_int8.h"
 #include "convolution_3x3_pack8to4_int8.h"

+// packed
+void convolution_packed_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+}
+
 // pack1
 void im2col_sgemm_int8_sse_avx512vnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
src/layer/x86/convolution_x86_avxvnni.cpp  (view file @ 55709708)

@@ -18,6 +18,7 @@
 namespace ncnn {

+#include "convolution_packed_int8.h"
 #include "convolution_sgemm_int8.h"
 #include "convolution_sgemm_pack1to4_int8.h"
 #include "convolution_sgemm_pack8to1_int8.h"
@@ -25,6 +26,12 @@ namespace ncnn {
 #include "convolution_3x3_pack8to1_int8.h"
 #include "convolution_3x3_pack8to4_int8.h"

+// packed
+void convolution_packed_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+}
+
 // pack1
 void im2col_sgemm_int8_sse_avxvnni(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
src/layer/x86/convolution_x86_xop.cpp  (view file @ 55709708)

@@ -18,6 +18,7 @@
 namespace ncnn {

+#include "convolution_packed_int8.h"
 #include "convolution_sgemm_int8.h"
 #include "convolution_sgemm_pack1to4_int8.h"
 #include "convolution_sgemm_pack8to1_int8.h"
@@ -25,6 +26,12 @@ namespace ncnn {
 #include "convolution_3x3_pack8to1_int8.h"
 #include "convolution_3x3_pack8to4_int8.h"

+// packed
+void convolution_packed_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    convolution_packed_int8(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+}
+
 // pack1
 void im2col_sgemm_int8_sse_xop(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
src/layer/x86/x86_usability.h  (view file @ 55709708)

@@ -967,6 +967,60 @@ static NCNN_FORCEINLINE void transpose16x8_epi16(__m256i& _r0, __m256i& _r1, __m
     _r7 = _mm256_permute2x128_si256(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1));
 }

+static NCNN_FORCEINLINE void transpose8x16_epi16(__m128i& _r0, __m128i& _r1, __m128i& _r2, __m128i& _r3, __m128i& _r4, __m128i& _r5, __m128i& _r6, __m128i& _r7, __m128i& _r8, __m128i& _r9, __m128i& _ra, __m128i& _rb, __m128i& _rc, __m128i& _rd, __m128i& _re, __m128i& _rf)
+{
+    __m128i _tmp0 = _mm_unpacklo_epi16(_r0, _r1);
+    __m128i _tmp1 = _mm_unpackhi_epi16(_r0, _r1);
+    __m128i _tmp2 = _mm_unpacklo_epi16(_r2, _r3);
+    __m128i _tmp3 = _mm_unpackhi_epi16(_r2, _r3);
+    __m128i _tmp4 = _mm_unpacklo_epi16(_r4, _r5);
+    __m128i _tmp5 = _mm_unpackhi_epi16(_r4, _r5);
+    __m128i _tmp6 = _mm_unpacklo_epi16(_r6, _r7);
+    __m128i _tmp7 = _mm_unpackhi_epi16(_r6, _r7);
+    __m128i _tmp8 = _mm_unpacklo_epi16(_r8, _r9);
+    __m128i _tmp9 = _mm_unpackhi_epi16(_r8, _r9);
+    __m128i _tmpa = _mm_unpacklo_epi16(_ra, _rb);
+    __m128i _tmpb = _mm_unpackhi_epi16(_ra, _rb);
+    __m128i _tmpc = _mm_unpacklo_epi16(_rc, _rd);
+    __m128i _tmpd = _mm_unpackhi_epi16(_rc, _rd);
+    __m128i _tmpe = _mm_unpacklo_epi16(_re, _rf);
+    __m128i _tmpf = _mm_unpackhi_epi16(_re, _rf);
+
+    __m128i _tmpg = _mm_unpacklo_epi32(_tmp0, _tmp2);
+    __m128i _tmph = _mm_unpackhi_epi32(_tmp0, _tmp2);
+    __m128i _tmpi = _mm_unpacklo_epi32(_tmp1, _tmp3);
+    __m128i _tmpj = _mm_unpackhi_epi32(_tmp1, _tmp3);
+    __m128i _tmpk = _mm_unpacklo_epi32(_tmp4, _tmp6);
+    __m128i _tmpl = _mm_unpackhi_epi32(_tmp4, _tmp6);
+    __m128i _tmpm = _mm_unpacklo_epi32(_tmp5, _tmp7);
+    __m128i _tmpn = _mm_unpackhi_epi32(_tmp5, _tmp7);
+    __m128i _tmpo = _mm_unpacklo_epi32(_tmp8, _tmpa);
+    __m128i _tmpp = _mm_unpackhi_epi32(_tmp8, _tmpa);
+    __m128i _tmpq = _mm_unpacklo_epi32(_tmp9, _tmpb);
+    __m128i _tmpr = _mm_unpackhi_epi32(_tmp9, _tmpb);
+    __m128i _tmps = _mm_unpacklo_epi32(_tmpc, _tmpe);
+    __m128i _tmpt = _mm_unpackhi_epi32(_tmpc, _tmpe);
+    __m128i _tmpu = _mm_unpacklo_epi32(_tmpd, _tmpf);
+    __m128i _tmpv = _mm_unpackhi_epi32(_tmpd, _tmpf);
+
+    _r0 = _mm_unpacklo_epi64(_tmpg, _tmpk);
+    _r1 = _mm_unpacklo_epi64(_tmpo, _tmps);
+    _r2 = _mm_unpackhi_epi64(_tmpg, _tmpk);
+    _r3 = _mm_unpackhi_epi64(_tmpo, _tmps);
+    _r4 = _mm_unpacklo_epi64(_tmph, _tmpl);
+    _r5 = _mm_unpacklo_epi64(_tmpp, _tmpt);
+    _r6 = _mm_unpackhi_epi64(_tmph, _tmpl);
+    _r7 = _mm_unpackhi_epi64(_tmpp, _tmpt);
+    _r8 = _mm_unpacklo_epi64(_tmpi, _tmpm);
+    _r9 = _mm_unpacklo_epi64(_tmpq, _tmpu);
+    _ra = _mm_unpackhi_epi64(_tmpi, _tmpm);
+    _rb = _mm_unpackhi_epi64(_tmpq, _tmpu);
+    _rc = _mm_unpacklo_epi64(_tmpj, _tmpn);
+    _rd = _mm_unpacklo_epi64(_tmpr, _tmpv);
+    _re = _mm_unpackhi_epi64(_tmpj, _tmpn);
+    _rf = _mm_unpackhi_epi64(_tmpr, _tmpv);
+}
+
 static NCNN_FORCEINLINE float _mm512_comp_reduce_add_ps(__m512 x)
 {
     const __m256 x256 = _mm256_add_ps(_mm512_castps512_ps256(x), _mm512_extractf32x8_ps(x, 1));
tests/test_convolution.cpp  (view file @ 55709708)

@@ -46,6 +46,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }

     {
@@ -65,6 +66,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
@@ -85,6 +87,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
@@ -98,6 +101,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
 #endif // __aarch64__
tests/test_convolution_1.cpp  (view file @ 55709708)

@@ -46,6 +46,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }

     {
@@ -65,6 +66,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
@@ -85,6 +87,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
@@ -98,6 +101,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
 #endif // __aarch64__
tests/test_convolution_2.cpp  (view file @ 55709708)

@@ -48,6 +48,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }

     {
@@ -67,6 +68,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
@@ -87,6 +89,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
@@ -99,6 +102,7 @@ static int test_convolution(int w, int h, int c, int outch, int kernel, int dila
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
tests/test_convolution_3.cpp  (view file @ 55709708)

@@ -187,6 +187,7 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }

     {
@@ -206,6 +207,7 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
@@ -226,6 +228,7 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int
     if (ret != 0)
     {
         fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]);
         return ret;
     }
     }
@@ -309,6 +312,82 @@ static int test_convolution_1()
            || test_convolution_int8(25, 33, 16, 15, 3, 1, 1, 1, 0)
            || test_convolution_int8(7, 7, 15, 12, 3, 1, 1, 1, 0);
 }

+static int test_convolution_1_2()
+{
+    return 0
+           || test_convolution_int8(19, 17, 1, 1, 3, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 2, 1, 3, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 7, 1, 3, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 8, 1, 3, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 15, 1, 3, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 16, 1, 3, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 31, 1, 3, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 32, 1, 3, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 1, 2, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 2, 2, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 7, 2, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 8, 2, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 15, 2, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 16, 2, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 31, 2, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 32, 2, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 1, 7, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 2, 7, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 7, 7, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 8, 7, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 15, 7, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 16, 7, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 31, 7, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 32, 7, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 1, 8, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 2, 8, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 7, 8, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 8, 8, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 15, 8, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 16, 8, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 31, 8, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 32, 8, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 1, 15, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 2, 15, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 7, 15, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 8, 15, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 15, 15, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 16, 15, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 31, 15, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 32, 15, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 1, 16, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 2, 16, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 7, 16, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 8, 16, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 15, 16, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 16, 16, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 31, 16, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 32, 16, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 1, 31, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 2, 31, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 7, 31, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 8, 31, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 15, 31, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 16, 31, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 31, 31, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 32, 31, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 1, 32, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 2, 32, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 7, 32, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 8, 32, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 15, 32, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 16, 32, 5, 2, 2, 0, 0)
+           || test_convolution_int8(19, 17, 31, 32, 5, 2, 2, 0, 1)
+           || test_convolution_int8(19, 17, 32, 32, 5, 2, 2, 0, 0);
+}
+
 #endif // NCNN_INT8

 int main()
@@ -318,6 +397,7 @@ int main()
 #if NCNN_INT8
     return 0
            || test_convolution_1()
+           || test_convolution_1_2()
            || test_convolution_2()
            || test_convolution_3();
 #else