Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
8d06c2b8
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
8d06c2b8
编写于
8月 27, 2020
作者:
Y
yangruoqi713
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[MS][LITE] optimize arm cpu int8 op conv depthwise: add common and slide window functions to select
上级
e6112ed1
变更
10
隐藏空白更改
内联
并排
Showing
10 changed file
with
686 addition
and
76 deletion
+686
-76
mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S
mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S
+9
-9
mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S
mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S
+169
-0
mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S
mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S
+122
-0
mindspore/lite/nnacl/int8/common_func.h
mindspore/lite/nnacl/int8/common_func.h
+4
-0
mindspore/lite/nnacl/int8/conv_depthwise_int8.c
mindspore/lite/nnacl/int8/conv_depthwise_int8.c
+96
-3
mindspore/lite/nnacl/int8/conv_depthwise_int8.h
mindspore/lite/nnacl/int8/conv_depthwise_int8.h
+6
-2
mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
...src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
+43
-57
mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h
.../src/runtime/kernel/arm/int8/convolution_depthwise_int8.h
+4
-5
mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.cc
...kernel/arm/int8/convolution_depthwise_slidewindow_int8.cc
+182
-0
mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h
.../kernel/arm/int8/convolution_depthwise_slidewindow_int8.h
+51
-0
未找到文件。
mindspore/lite/nnacl/assembly/arm64/ConvDwFp32Row.S
浏览文件 @
8d06c2b8
...
...
@@ -29,7 +29,7 @@ mov x6, x1
mov
x7
,
x2
mov
x8
,
x4
Loop
Input
Depth16In
:
LoopDepth16In
:
cmp
x8
,
#
16
blt
L4
sub
x8
,
x8
,
#
16
...
...
@@ -39,8 +39,8 @@ mov x8, x4
ld1
{
v16
.4
s
,
v17
.4
s
},
[
x0
],
#
32
cmp
x8
,
#
16
blt
Loop
Input
Depth16Out
Loop
Input
Depth16
:
blt
LoopDepth16Out
LoopDepth16
:
fmla
v16
.4
s
,
v0
.4
s
,
v2
.4
s
fmla
v17
.4
s
,
v1
.4
s
,
v3
.4
s
...
...
@@ -61,9 +61,9 @@ mov x8, x4
sub
x8
,
x8
,
#
16
cmp
x8
,
#
16
bge
Loop
Input
Depth16
bge
LoopDepth16
Loop
Input
Depth16Out
:
LoopDepth16Out
:
fmla
v16
.4
s
,
v0
.4
s
,
v2
.4
s
fmla
v17
.4
s
,
v1
.4
s
,
v3
.4
s
st1
{
v16
.4
s
,
v17
.4
s
},
[
x9
],
#
32
...
...
@@ -81,7 +81,7 @@ mov x8, x4
cmp
x8
,
#
4
blt
L0
Loop
Input
Depth4
:
LoopDepth4
:
ld1
{
v0
.4
s
},
[
x6
],
#
16
ld1
{
v2
.4
s
},
[
x7
],
#
16
ld1
{
v16
.4
s
},
[
x0
],
#
16
...
...
@@ -89,13 +89,13 @@ mov x8, x4
st1
{
v16
.4
s
},
[
x9
],
#
16
sub
x8
,
x8
,
#
4
cmp
x8
,
#
4
bge
Loop
Input
Depth4
bge
LoopDepth4
L0
:
cmp
x8
,
#
0
beq
Loop16LineEnd
Loop
Input
Depth0
:
LoopDepth0
:
ldr
s0
,
[
x6
],
#
4
ldr
s1
,
[
x7
],
#
4
ldr
s2
,
[
x0
],
#
4
...
...
@@ -103,7 +103,7 @@ mov x8, x4
fadd
s2
,
s2
,
s0
str
s2
,
[
x9
],
#
4
subs
x8
,
x8
,
#
1
bne
Loop
Input
Depth0
bne
LoopDepth0
Loop16LineEnd
:
...
...
mindspore/lite/nnacl/assembly/arm64/ConvDwInt8PostAlign4.S
0 → 100644
浏览文件 @
8d06c2b8
#ifdef __aarch64__

.text
.align 5
.global ConvDwInt8PostAlign4
#ifndef __APPLE__
.type ConvDwInt8PostAlign4, %function
#endif

// void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
//                           int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max);
//
// Requantizes int32 accumulators from `buffer` into int8 values written to `dst`,
// consuming 16, 8 or 4 elements per step. Any remainder below 4 elements is left
// untouched.
//
// x0: dst, x1: buffer, x2: num_pixels, x3: output_zp, x4: out_multiplier,
// x5: left_shift, x6: right_shift, x7: acc_min,
// acc_max: ninth argument, read from [sp] into x8 at function entry

ConvDwInt8PostAlign4:
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
    // whereas our coding style does not permit such amount of parameters
    // Only v0-v3, v16-v19 and v26-v31 are used below, so nothing is saved.
    ldr x8, [sp]

    // Broadcast the scalar quantization parameters to all four lanes.
    dup v26.4s, w5    // left_shift
    dup v27.4s, w4    // out_multiplier
    dup v28.4s, w6    // right_shift

    dup v29.4s, w3    // output_zp
    dup v30.4s, w7    // acc_min
    dup v31.4s, w8    // acc_max

    cmp x2, 16
    blt LoopDepth8

    LoopDepth16:
        // Load 16 int32 accumulators.
        ld1 {v0.4s}, [x1], #16
        ld1 {v1.4s}, [x1], #16
        ld1 {v2.4s}, [x1], #16
        ld1 {v3.4s}, [x1], #16

        // Saturating shift by left_shift.
        sqshl v0.4s, v0.4s, v26.4s
        sqshl v1.4s, v1.4s, v26.4s
        sqshl v2.4s, v2.4s, v26.4s
        sqshl v3.4s, v3.4s, v26.4s

        // Saturating rounding doubling high multiply by out_multiplier.
        sqrdmulh v0.4s, v0.4s, v27.4s
        sqrdmulh v1.4s, v1.4s, v27.4s
        sqrdmulh v2.4s, v2.4s, v27.4s
        sqrdmulh v3.4s, v3.4s, v27.4s

        // Rounding fix-up: and + sshr #31 yields -1 in lanes where both the value and
        // right_shift have their sign bit set (0 elsewhere); that is added before the
        // rounding shift by right_shift.
        // NOTE(review): presumably right_shift is passed as a non-positive count so that
        // srshl shifts right - confirm against the caller.
        and v16.16b, v28.16b, v0.16b
        sshr v16.4s, v16.4s, #31
        sqadd v0.4s, v0.4s, v16.4s
        srshl v0.4s, v0.4s, v28.4s
        and v17.16b, v28.16b, v1.16b
        sshr v17.4s, v17.4s, #31
        sqadd v1.4s, v1.4s, v17.4s
        srshl v1.4s, v1.4s, v28.4s
        and v18.16b, v28.16b, v2.16b
        sshr v18.4s, v18.4s, #31
        sqadd v2.4s, v2.4s, v18.4s
        srshl v2.4s, v2.4s, v28.4s
        and v19.16b, v28.16b, v3.16b
        sshr v19.4s, v19.4s, #31
        sqadd v3.4s, v3.4s, v19.4s
        srshl v3.4s, v3.4s, v28.4s

        // Add the output zero point.
        add v0.4s, v0.4s, v29.4s
        add v1.4s, v1.4s, v29.4s
        add v2.4s, v2.4s, v29.4s
        add v3.4s, v3.4s, v29.4s

        // Clamp to [acc_min, acc_max].
        smax v0.4s, v0.4s, v30.4s
        smax v1.4s, v1.4s, v30.4s
        smax v2.4s, v2.4s, v30.4s
        smax v3.4s, v3.4s, v30.4s

        smin v0.4s, v0.4s, v31.4s
        smin v1.4s, v1.4s, v31.4s
        smin v2.4s, v2.4s, v31.4s
        smin v3.4s, v3.4s, v31.4s

        // Narrow with saturation: int32 -> int16 -> int8.
        sqxtn v0.4h, v0.4s
        sqxtn v1.4h, v1.4s
        sqxtn v2.4h, v2.4s
        sqxtn v3.4h, v3.4s

        sqxtn v0.8b, v0.8h
        sqxtn v1.8b, v1.8h
        sqxtn v2.8b, v2.8h
        sqxtn v3.8b, v3.8h

        // Each register now holds 4 result bytes in its lowest 32-bit lane.
        st1 {v0.s}[0], [x0], #4
        st1 {v1.s}[0], [x0], #4
        st1 {v2.s}[0], [x0], #4
        st1 {v3.s}[0], [x0], #4

        sub x2, x2, #16
        cmp x2, #16
        bge LoopDepth16

    LoopDepth8:
        // Same pipeline as above for 8 elements per pass.
        cmp x2, #8
        blt LoopDepth4
        ld1 {v0.4s}, [x1], #16
        ld1 {v1.4s}, [x1], #16

        sqshl v0.4s, v0.4s, v26.4s
        sqshl v1.4s, v1.4s, v26.4s

        sqrdmulh v0.4s, v0.4s, v27.4s
        sqrdmulh v1.4s, v1.4s, v27.4s

        and v16.16b, v28.16b, v0.16b
        sshr v16.4s, v16.4s, #31
        sqadd v0.4s, v0.4s, v16.4s
        srshl v0.4s, v0.4s, v28.4s
        and v17.16b, v28.16b, v1.16b
        sshr v17.4s, v17.4s, #31
        sqadd v1.4s, v1.4s, v17.4s
        srshl v1.4s, v1.4s, v28.4s

        add v0.4s, v0.4s, v29.4s
        add v1.4s, v1.4s, v29.4s

        smax v0.4s, v0.4s, v30.4s
        smax v1.4s, v1.4s, v30.4s

        smin v0.4s, v0.4s, v31.4s
        smin v1.4s, v1.4s, v31.4s

        sqxtn v0.4h, v0.4s
        sqxtn v1.4h, v1.4s

        sqxtn v0.8b, v0.8h
        sqxtn v1.8b, v1.8h

        st1 {v0.s}[0], [x0], #4
        st1 {v1.s}[0], [x0], #4

        sub x2, x2, #8
        cmp x2, #8
        bge LoopDepth8

    LoopDepth4:
        // Same pipeline for 4 elements per pass.
        cmp x2, #4
        blt End
        ld1 {v0.4s}, [x1], #16

        sqshl v0.4s, v0.4s, v26.4s
        sqrdmulh v0.4s, v0.4s, v27.4s

        and v16.16b, v28.16b, v0.16b
        sshr v16.4s, v16.4s, #31
        sqadd v0.4s, v0.4s, v16.4s
        srshl v0.4s, v0.4s, v28.4s

        add v0.4s, v0.4s, v29.4s
        smax v0.4s, v0.4s, v30.4s
        smin v0.4s, v0.4s, v31.4s

        sqxtn v0.4h, v0.4s
        sqxtn v0.8b, v0.8h

        st1 {v0.s}[0], [x0], #4

        // `sub` does not update the condition flags, so this `bge` still sees the flags of
        // the `cmp x2, #4` at the top of the loop (which were "ge" to get here). The branch
        // is therefore always taken and the exit test happens at the loop head.
        sub x2, x2, #4
        bge LoopDepth4
    End:
        ret
#endif
mindspore/lite/nnacl/assembly/arm64/ConvDwInt8Row.S
0 → 100644
浏览文件 @
8d06c2b8
#ifdef __aarch64__

.text
.align 5
.global ConvDwInt8Row
#ifndef __APPLE__
.type ConvDwInt8Row, %function
#endif

// void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
//                    int output_channel, int input_step, int8_t input_zp)
//
// For each of num_pixels pixels, accumulates
//     output[c] += (input[c] - input_zp) * weight[c]      for c in [0, output_channel)
// into consecutive int32 accumulators. The output pointer keeps advancing across
// pixels; the input pointer advances by input_step bytes per pixel and the weight
// pointer restarts at weight_ptr for every pixel.
//
// x0: output_ptr, x1: input_ptr, x2: weight_ptr, x3: num_pixels,
// x4: output_channel, x5: input_step, x6: input_zp
//
ConvDwInt8Row:
    // registers v8 ~ v15 must be preserved by a callee across subroutine calls, according to
    // https://github.com/ARM-software/abi-aa/blob/master/aapcs64/aapcs64.rst#simd-and-floating-point-registers
    // x19 ~ x29 should be also preserved
    // whereas our coding style does not permit such amount of parameters
    // Only caller-saved registers (x7-x10, x14-x16, v0-v3, v16-v21, v31) are used.
    cmp x3, #0
    beq End

    // x0 is the read cursor and x10 the write cursor over the accumulators.
    mov x10, x0

    // input_zp is an int8_t argument: sign-extend it explicitly before the scalar
    // tail uses it as a 32-bit value, instead of relying on the caller having done so.
    sxtb w6, w6
    dup v31.8b, w6

    LoopOutPixel:
    mov x7, x1      // input cursor for this pixel
    mov x8, x2      // weight cursor (restarts for every pixel)
    mov x9, x4      // channels left for this pixel

        // 16 channels per pass, software-pipelined: the first 8 channels of the next
        // group are computed before the second 8 channels of the current group are stored.
        LoopDepth16In:
        cmp x9, #16
        blt L8
        sub x9, x9, #16

        ld1 {v0.8b, v1.8b}, [x7], #16
        ld1 {v2.8h, v3.8h}, [x8], #32
        ld1 {v16.4s, v17.4s}, [x0], #32

        // (input - input_zp) widened to int16, then multiply-accumulate into int32.
        ssubl v20.8h, v0.8b, v31.8b
        smlal v16.4s, v20.4h, v2.4h
        smlal2 v17.4s, v20.8h, v2.8h

        cmp x9, #16
        blt LoopDepth16Out
        LoopDepth16:

        st1 {v16.4s, v17.4s}, [x10], #32
        ld1 {v18.4s, v19.4s}, [x0], #32

        ssubl v21.8h, v1.8b, v31.8b
        smlal v18.4s, v21.4h, v3.4h
        smlal2 v19.4s, v21.8h, v3.8h
        st1 {v18.4s, v19.4s}, [x10], #32

        ld1 {v0.8b, v1.8b}, [x7], #16
        ld1 {v2.8h, v3.8h}, [x8], #32
        ld1 {v16.4s, v17.4s}, [x0], #32

        ssubl v20.8h, v0.8b, v31.8b
        smlal v16.4s, v20.4h, v2.4h
        smlal2 v17.4s, v20.8h, v2.8h

        sub x9, x9, #16
        cmp x9, #16
        bge LoopDepth16

        // Drain the pipeline: finish the last group of 16 channels.
        LoopDepth16Out:
        st1 {v16.4s, v17.4s}, [x10], #32
        ld1 {v18.4s, v19.4s}, [x0], #32

        ssubl v21.8h, v1.8b, v31.8b
        smlal v18.4s, v21.4h, v3.4h
        smlal2 v19.4s, v21.8h, v3.8h
        st1 {v18.4s, v19.4s}, [x10], #32

        L8:
        cmp x9, #8
        blt L0

        // 8 channels per pass.
        LoopDepth8:
        ld1 {v0.8b}, [x7], #8
        ld1 {v2.8h}, [x8], #16
        ld1 {v16.4s, v17.4s}, [x0], #32

        ssubl v20.8h, v0.8b, v31.8b
        smlal v16.4s, v20.4h, v2.4h
        smlal2 v17.4s, v20.8h, v2.8h
        st1 {v16.4s, v17.4s}, [x10], #32

        sub x9, x9, #8
        cmp x9, #8
        bge LoopDepth8

        L0:
        cmp x9, #0
        beq Loop16LineEnd

        // Scalar tail, one channel per pass.
        LoopDepth0:
        ldrsb w14, [x7], #1
        ldrsh w15, [x8], #2
        ldr w16, [x0], #4
        // Subtract the zero point, matching the ssubl in the vector paths and the C
        // reference implementation (this used to be an `add`).
        sub w14, w14, w6

        sxth w14, w14
        madd w14, w14, w15, w16
        str w14, [x10], #4

        subs x9, x9, #1
        bne LoopDepth0

        Loop16LineEnd:

    // `add` leaves the flags from `subs` intact for the `bne`.
    subs x3, x3, #1
    add x1, x1, x5
    bne LoopOutPixel

    End:
    ret

#endif
mindspore/lite/nnacl/int8/common_func.h
浏览文件 @
8d06c2b8
...
...
@@ -49,6 +49,10 @@ void ConvDwInt8Center(int8_t *dst, const int16_t *src, const int16_t *weight, co
size_t
width
,
size_t
kernel_h
,
size_t
kernel_w
,
size_t
out_h_step
,
size_t
block_channel
,
size_t
in_sh_step
,
size_t
in_sw_step
,
size_t
in_kh_step
,
size_t
in_kw_step
,
int
out_multiplier
,
int
left_shift
,
int
right_shift
,
int32_t
out_zp
,
int32_t
acc_min
,
int32_t
acc_max
);
// Accumulates (input - input_zp) * weight per channel into the int32 accumulators at
// output_ptr, for num_pixels pixels of output_channel channels each; input_ptr advances
// by input_step elements per pixel.
void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
                   int output_channel, int input_step, int8_t input_zp);
// Requantizes int32 accumulators in buffer to int8 values in dst (shift, multiply,
// rounding shift, add output_zp, clamp to [acc_min, acc_max]). Works on groups of four
// elements; callers pass a num_pixels that is a multiple of 4.
void ConvDwInt8PostAlign4(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
                          int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max);
#endif
#ifdef __cplusplus
...
...
mindspore/lite/nnacl/int8/conv_depthwise_int8.c
浏览文件 @
8d06c2b8
...
...
@@ -20,6 +20,99 @@
#include "nnacl/int8/common_func.h"
/*conv depthwise int8 begin*/
// only support perlayer
#ifndef ENABLE_ARM64
/*
 * Portable fallback for the ARM64 assembly routine of the same name.
 *
 * For every one of num_pixels pixels, adds (input - input_zp) * weight for each of
 * output_channel channels to the next int32 accumulator. Accumulators are consumed
 * contiguously across pixels; the input cursor moves input_step elements per pixel;
 * the same output_channel weights are reused for every pixel.
 */
void ConvDwInt8Row(int32_t *output_ptr, const int8_t *input_ptr, const int16_t *weight_ptr, int num_pixels,
                   int output_channel, int input_step, int8_t input_zp) {
  int32_t *acc = output_ptr;
  const int8_t *pixel = input_ptr;
  for (int remaining = num_pixels; remaining > 0; --remaining, pixel += input_step) {
    for (int ch = 0; ch < output_channel; ++ch, ++acc) {
      /* remove the input zero point before multiplying */
      const int16_t centered = pixel[ch] - input_zp;
      *acc += centered * weight_ptr[ch];
    }
  }
}
#endif
/*
 * Converts num_pixels int32 accumulators in buffer to int8 values in dst.
 *
 * Each value is multiplied by 2^left_shift, scaled by out_multiplier with a saturating
 * rounding doubling high multiply, divided by a power of two (-right_shift) with
 * rounding, offset by output_zp and clamped to [acc_min, acc_max]. The clamped int32
 * value is also written back to buffer.
 *
 * On ARM64 the largest multiple-of-4 prefix is handled by ConvDwInt8PostAlign4 and only
 * the remaining tail runs through the scalar loop.
 */
void ConvDwInt8Post(int8_t *dst, int32_t *buffer, int num_pixels, int32_t output_zp, int32_t out_multiplier,
                    int32_t left_shift, int32_t right_shift, int32_t acc_min, int32_t acc_max) {
  int tail_begin = 0;
#ifdef ENABLE_ARM64
  tail_begin = num_pixels / 4 * 4;
  ConvDwInt8PostAlign4(dst, buffer, tail_begin, output_zp, out_multiplier, left_shift, right_shift, acc_min,
                       acc_max);
#endif
  int idx = tail_begin;
  while (idx < num_pixels) {
    int32_t *acc = buffer + idx;
    *acc = RoundingDivideByPOT(
      SaturatingRoundingDoublingHighMul(*acc * (1 << (unsigned int)left_shift), out_multiplier), -right_shift);
    *acc += output_zp;
    *acc = MSMAX(*acc, acc_min);
    *acc = MSMIN(*acc, acc_max);
    dst[idx] = (*acc);
    ++idx;
  }
}
/*
 * Depthwise int8 convolution using per-layer quantization parameters (index 0 of every
 * quantization array).
 *
 * Input and output are indexed as [batch][h][w][channel]. Output rows are split between
 * tasks: task_id handles rows [h_step * task_id, min(h_step * (task_id + 1), output_h_)),
 * with h_step = UP_DIV(output_h_, thread_num_).
 *
 * output_data  int8 result.
 * row_buffer   scratch space for one output row of int32 accumulators
 *              (output_w_ * output_channel_ values); must not be shared between tasks.
 * input_data   int8 input.
 * weight_data  int16 weights indexed as [kh][kw][channel].
 * bias_data    output_channel_ int32 biases used to initialise every accumulator.
 * conv_param   shapes, strides, pads, dilations and quantization arguments.
 * task_id      index of the calling task.
 */
void ConvDwInt8(int8_t *output_data, int32_t *row_buffer, const int8_t *input_data, const int16_t *weight_data,
                const int32_t *bias_data, const ConvParameter *conv_param, int task_id) {
  int h_step = UP_DIV(conv_param->output_h_, conv_param->thread_num_);
  int h_start = h_step * task_id;
  int h_end = MSMIN(h_start + h_step, conv_param->output_h_);

  int out_multiplier = conv_param->conv_quant_arg_.quant_multiplier_[0];
  int left_shift = conv_param->conv_quant_arg_.left_shift_[0];
  int right_shift = conv_param->conv_quant_arg_.right_shift_[0];

  int input_zp = conv_param->conv_quant_arg_.input_quant_args_[0].zp_;
  int output_zp = conv_param->conv_quant_arg_.output_quant_args_[0].zp_;
  int acc_min = conv_param->conv_quant_arg_.out_act_min_[0];
  int acc_max = conv_param->conv_quant_arg_.out_act_max_[0];

  /* distance in elements between two horizontally adjacent output pixels in the input */
  int in_sw_step = conv_param->stride_w_ * conv_param->input_channel_;

  for (int b = 0; b < conv_param->output_batch_; b++) {
    const int8_t *src = input_data + b * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_;
    int8_t *dst = output_data + b * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_;
    for (int oh = h_start; oh < h_end; oh++) {
      int8_t *dst_data = dst + oh * conv_param->output_w_ * conv_param->output_channel_;

      /* kernel rows [start_kh, end_kh) are the ones that fall inside the input vertically */
      int ih_origin = oh * conv_param->stride_h_ - conv_param->pad_u_;
      int start_kh = MSMAX(0, UP_DIV(-ih_origin, conv_param->dilation_h_));
      int end_kh = MSMIN(conv_param->kernel_h_, UP_DIV(conv_param->input_h_ - ih_origin, conv_param->dilation_h_));

      /* init acc: every output pixel starts from the bias */
      for (int ow = 0; ow < conv_param->output_w_; ow++) {
        memcpy(row_buffer + ow * conv_param->output_channel_, bias_data, conv_param->output_channel_ * sizeof(int32_t));
      }
      for (int kh = start_kh; kh < end_kh; kh++) {
        /* vertical offset must use the vertical dilation, matching start_kh / end_kh */
        int ih = ih_origin + conv_param->dilation_h_ * kh;

        const int8_t *src_kh = src + ih * conv_param->input_w_ * conv_param->input_channel_;
        const int16_t *weight_kh = weight_data + kh * conv_param->kernel_w_ * conv_param->output_channel_;

        for (int kw = 0; kw < conv_param->kernel_w_; kw++) {
          /* output columns [out_w_start, out_w_end) are those whose input column for this kw is in range */
          int out_w_start = MSMAX(
            0, (conv_param->pad_l_ - conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) / conv_param->stride_w_);
          int out_w_end = MSMIN(conv_param->output_w_, (conv_param->input_w_ + conv_param->pad_l_ -
                                                        conv_param->dilation_w_ * kw + conv_param->stride_w_ - 1) /
                                                         conv_param->stride_w_);

          int32_t *acc_w = row_buffer + out_w_start * conv_param->output_channel_;
          int iw_origin = (out_w_start * conv_param->stride_w_) - conv_param->pad_l_ + conv_param->dilation_w_ * kw;

          const int8_t *src_kw = src_kh + iw_origin * conv_param->input_channel_;
          int num_pixels = out_w_end - out_w_start;

          /* a kernel column that lies entirely in the padding gives num_pixels <= 0; the
           * row routine must not be entered with a negative count */
          if (num_pixels > 0) {
            ConvDwInt8Row(acc_w, src_kw, weight_kh, num_pixels, conv_param->output_channel_, in_sw_step, input_zp);
          }
          weight_kh += conv_param->output_channel_;
        }
      }
      /* post func, acc int32 -> dst int8 */
      ConvDwInt8Post(dst_data, row_buffer, conv_param->output_w_ * conv_param->output_channel_, output_zp,
                     out_multiplier, left_shift, right_shift, acc_min, acc_max);
    }
  }
}
/*conv depthwise int8 end*/
/*conv depthwise sliding window int8 begin*/
void
DepthwiseBorderPixelInt8
(
int8_t
*
dst
,
const
int16_t
*
src
,
const
int16_t
*
weight
,
const
int32_t
*
bias
,
int
height
,
int
width
,
int
in_kh_step
,
int
in_kw_step
,
int
kernel_w
,
int
*
out_multiplier
,
int
*
left_shift
,
int
*
right_shift
,
int32_t
out_zp
,
int32_t
acc_min
,
int32_t
acc_max
,
...
...
@@ -153,8 +246,8 @@ void DepthwiseCenterInt8(int8_t *dst, const int16_t *src, const int16_t *weight,
}
#endif
void
ConvDwInt8
(
int8_t
*
output_data
,
const
int16_t
*
input_data
,
const
int16_t
*
weight_data
,
const
int32_t
*
bias_data
,
const
ConvParameter
*
conv_param
,
const
SlidingWindowParam
*
sliding
,
int
task_id
)
{
void
ConvDw
SW
Int8
(
int8_t
*
output_data
,
const
int16_t
*
input_data
,
const
int16_t
*
weight_data
,
const
int32_t
*
bias_data
,
const
ConvParameter
*
conv_param
,
const
SlidingWindowParam
*
sliding
,
int
task_id
)
{
const
int16_t
*
src
=
input_data
;
int8_t
*
dst
=
output_data
;
bool
per_channel
=
conv_param
->
conv_quant_arg_
.
per_channel_
&
FILTER_PER_CHANNEL
;
...
...
@@ -215,7 +308,7 @@ void ConvDwInt8(int8_t *output_data, const int16_t *input_data, const int16_t *w
}
// batch loop
// output nhwc4
}
/*conv depthwise int8 end*/
/*conv depthwise
sliding window
int8 end*/
/*deconv depthwise int8 begin*/
void
DeconvDepthwiseBorderPixelInt8
(
int32_t
*
dst
,
const
int16_t
*
src
,
const
int16_t
*
weight
,
int
height
,
int
width
,
...
...
mindspore/lite/nnacl/int8/conv_depthwise_int8.h
浏览文件 @
8d06c2b8
...
...
@@ -23,8 +23,12 @@
#ifdef __cplusplus
extern
"C"
{
#endif
void
ConvDwInt8
(
int8_t
*
output_data
,
const
int16_t
*
input_data
,
const
int16_t
*
weight_data
,
const
int32_t
*
bias_data
,
const
ConvParameter
*
conv_param
,
const
SlidingWindowParam
*
sliding
,
int
task_id
);
void
ConvDwInt8
(
int8_t
*
output_data
,
int32_t
*
output_row
,
const
int8_t
*
input_data
,
const
int16_t
*
weight_data
,
const
int32_t
*
bias_data
,
const
ConvParameter
*
conv_param
,
int
task_id
);
void
ConvDwSWInt8
(
int8_t
*
output_data
,
const
int16_t
*
input_data
,
const
int16_t
*
weight_data
,
const
int32_t
*
bias_data
,
const
ConvParameter
*
conv_param
,
const
SlidingWindowParam
*
sliding
,
int
task_id
);
void
DeconvDwInt8
(
int8_t
*
output_data
,
int32_t
*
output_buffer
,
const
int16_t
*
input_data
,
const
int16_t
*
weight_data
,
const
int32_t
*
bias_data
,
const
ConvParameter
*
conv_param
,
const
SlidingWindowParam
*
sliding
,
...
...
mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
浏览文件 @
8d06c2b8
...
...
@@ -15,6 +15,7 @@
*/
#include "src/runtime/kernel/arm/int8/convolution_depthwise_int8.h"
#include "src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
...
...
@@ -29,10 +30,6 @@ using mindspore::schema::PrimitiveType_DepthwiseConv2D;
namespace
mindspore
::
kernel
{
ConvolutionDepthwiseInt8CPUKernel
::~
ConvolutionDepthwiseInt8CPUKernel
()
{
if
(
sliding
!=
nullptr
)
{
delete
sliding
;
sliding
=
nullptr
;
}
if
(
packed_weight_
!=
nullptr
)
{
free
(
packed_weight_
);
packed_weight_
=
nullptr
;
...
...
@@ -42,63 +39,44 @@ ConvolutionDepthwiseInt8CPUKernel::~ConvolutionDepthwiseInt8CPUKernel() {
int
ConvolutionDepthwiseInt8CPUKernel
::
InitWeightBias
()
{
// init weight, int8 -> int16
// o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
auto
weight_tensor
=
in_tensors_
[
kWeightIndex
];
auto
origin_weight
=
reinterpret_cast
<
int8_t
*>
(
weight_tensor
->
Data
());
int
OC4
=
UP_DIV
(
weight_tensor
->
Batch
(),
C4NUM
);
int
pack_weight_size
=
C4NUM
*
OC4
*
weight_tensor
->
Height
()
*
weight_tensor
->
Width
();
int
channel
=
weight_tensor
->
Batch
();
int
pack_weight_size
=
channel
*
weight_tensor
->
Height
()
*
weight_tensor
->
Width
();
auto
tmp_weight
=
reinterpret_cast
<
int8_t
*>
(
malloc
(
pack_weight_size
*
sizeof
(
int8_t
)));
if
(
tmp_weight
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
PackNCHWToNHWCInt8
(
origin_weight
,
tmp_weight
,
1
,
weight_tensor
->
Height
()
*
weight_tensor
->
Width
(),
weight_tensor
->
Batch
());
int
weight_zp
=
conv_param_
->
conv_quant_arg_
.
filter_quant_args_
[
0
].
zp_
;
packed_weight_
=
reinterpret_cast
<
int16_t
*>
(
malloc
(
pack_weight_size
*
sizeof
(
int16_t
)));
if
(
packed_weight_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
PackDepthwiseInt8Weight
(
origin_weight
,
packed_weight_
,
weight_tensor
->
Height
()
*
weight_tensor
->
Width
(),
weight_tensor
->
Batch
(),
&
(
conv_param_
->
conv_quant_arg_
));
for
(
int
i
=
0
;
i
<
weight_tensor
->
ElementsNum
();
i
++
)
{
packed_weight_
[
i
]
=
(
int16_t
)(
tmp_weight
[
i
]
-
weight_zp
);
}
bias_data_
=
reinterpret_cast
<
int32_t
*>
(
malloc
(
C4NUM
*
OC4
*
sizeof
(
int32_t
)));
bias_data_
=
reinterpret_cast
<
int32_t
*>
(
malloc
(
channel
*
sizeof
(
int32_t
)));
if
(
bias_data_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memset
(
bias_data_
,
0
,
C4NUM
*
OC4
*
sizeof
(
int32_t
));
memset
(
bias_data_
,
0
,
channel
*
sizeof
(
int32_t
));
if
(
in_tensors_
.
size
()
==
kInputSize2
)
{
auto
bias_tensor
=
in_tensors_
.
at
(
kBiasIndex
);
auto
ori_bias
=
reinterpret_cast
<
int32_t
*>
(
bias_tensor
->
Data
());
memcpy
(
bias_data_
,
ori_bias
,
bias_tensor
->
ElementsNum
()
*
sizeof
(
int32_t
));
}
conv_param_
->
thread_num_
=
MSMIN
(
thread_count_
,
OC4
);
return
RET_OK
;
}
int
ConvolutionDepthwiseInt8CPUKernel
::
InitBuffer
()
{
int
pack_input_size
=
conv_param_
->
input_batch_
*
conv_param_
->
input_h_
*
conv_param_
->
input_w_
*
C4NUM
*
UP_DIV
(
conv_param_
->
input_channel_
,
4
);
packed_input_
=
reinterpret_cast
<
int16_t
*>
(
context_
->
allocator
->
Malloc
(
pack_input_size
*
sizeof
(
int16_t
)));
if
(
packed_input_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
if
(
conv_param_
->
input_channel_
%
C4NUM
!=
0
)
{
need_align_
=
true
;
int
pack_output_size
=
conv_param_
->
output_batch_
*
conv_param_
->
output_h_
*
conv_param_
->
output_w_
*
C4NUM
*
UP_DIV
(
conv_param_
->
output_channel_
,
C4NUM
);
packed_output_
=
reinterpret_cast
<
int8_t
*>
(
context_
->
allocator
->
Malloc
(
pack_output_size
*
sizeof
(
int8_t
)));
if
(
packed_input_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
}
return
RET_OK
;
}
int
ConvolutionDepthwiseInt8CPUKernel
::
Init
()
{
sliding
=
new
(
std
::
nothrow
)
SlidingWindowParam
;
if
(
sliding
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"new sliding window param."
;
return
RET_ERROR
;
}
if
(
!
InferShapeDone
())
{
return
RET_OK
;
}
...
...
@@ -107,13 +85,12 @@ int ConvolutionDepthwiseInt8CPUKernel::Init() {
int
ConvolutionDepthwiseInt8CPUKernel
::
ReSize
()
{
ConvolutionBaseCPUKernel
::
Init
();
InitSlidingParamConvDw
(
sliding
,
conv_param_
,
C4NUM
);
auto
ret
=
ConvolutionBaseCPUKernel
::
SetQuantParam
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Set quant param failed."
;
return
ret
;
}
conv_param_
->
thread_num_
=
MSMIN
(
thread_count_
,
conv_param_
->
output_h_
);
ret
=
InitWeightBias
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Depthwise int8 InitWeightBias error!"
;
...
...
@@ -123,8 +100,9 @@ int ConvolutionDepthwiseInt8CPUKernel::ReSize() {
}
int
ConvolutionDepthwiseInt8CPUKernel
::
Execute
(
int
task_id
)
{
ConvDwInt8
(
packed_output_
,
packed_input_
,
packed_weight_
,
reinterpret_cast
<
int32_t
*>
(
bias_data_
),
conv_param_
,
sliding
,
task_id
);
auto
buffer
=
row_buffer_
+
conv_param_
->
output_w_
*
conv_param_
->
output_channel_
*
task_id
;
ConvDwInt8
(
output_ptr_
,
buffer
,
input_ptr_
,
packed_weight_
,
reinterpret_cast
<
int32_t
*>
(
bias_data_
),
conv_param_
,
task_id
);
return
RET_OK
;
}
...
...
@@ -138,6 +116,16 @@ int ConvDwInt8Run(void *cdata, int task_id) {
return
RET_OK
;
}
int
ConvolutionDepthwiseInt8CPUKernel
::
InitBuffer
()
{
int
output_row_size
=
conv_param_
->
thread_num_
*
conv_param_
->
output_w_
*
conv_param_
->
output_channel_
;
row_buffer_
=
reinterpret_cast
<
int32_t
*>
(
context_
->
allocator
->
Malloc
(
output_row_size
*
sizeof
(
float
)));
if
(
row_buffer_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
return
RET_OK
;
}
int
ConvolutionDepthwiseInt8CPUKernel
::
Run
()
{
if
(
conv_param_
->
input_channel_
!=
conv_param_
->
output_channel_
)
{
MS_LOG
(
ERROR
)
<<
"Only support input channel equals output channel."
;
...
...
@@ -156,13 +144,10 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() {
}
auto
input_tensor
=
in_tensors_
.
at
(
kInputIndex
);
auto
input_addr
=
reinterpret_cast
<
int8_t
*>
(
input_tensor
->
Data
());
PackDepthwiseInt8Input
(
input_addr
,
packed_input_
,
conv_param_
);
input_ptr_
=
reinterpret_cast
<
int8_t
*>
(
input_tensor
->
Data
());
auto
output_addr
=
reinterpret_cast
<
int8_t
*>
(
out_tensors_
.
at
(
kOutputIndex
)
->
Data
());
if
(
!
need_align_
)
{
packed_output_
=
output_addr
;
}
auto
output_tensor
=
out_tensors_
.
at
(
kOutputIndex
);
output_ptr_
=
reinterpret_cast
<
int8_t
*>
(
output_tensor
->
Data
());
ret
=
ParallelLaunch
(
THREAD_POOL_DEFAULT
,
ConvDwInt8Run
,
this
,
conv_param_
->
thread_num_
);
if
(
ret
!=
RET_OK
)
{
...
...
@@ -170,12 +155,7 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() {
return
RET_ERROR
;
}
if
(
need_align_
)
{
PackNHWC4ToNHWCInt8
(
packed_output_
,
output_addr
,
conv_param_
->
output_batch_
,
conv_param_
->
output_h_
*
conv_param_
->
output_w_
,
conv_param_
->
output_channel_
);
context_
->
allocator
->
Free
(
packed_output_
);
}
context_
->
allocator
->
Free
(
packed_input_
);
context_
->
allocator
->
Free
(
row_buffer_
);
return
RET_OK
;
}
...
...
@@ -186,8 +166,14 @@ kernel::LiteKernel *CpuConvDwInt8KernelCreator(const std::vector<lite::tensor::T
const
mindspore
::
lite
::
PrimitiveC
*
primitive
)
{
MS_ASSERT
(
opParameter
!=
nullptr
);
MS_ASSERT
(
desc
.
type
==
schema
::
PrimitiveType_DepthwiseConv2D
);
auto
kernel
=
new
(
std
::
nothrow
)
kernel
::
ConvolutionDepthwiseInt8CPUKernel
(
opParameter
,
inputs
,
outputs
,
ctx
,
primitive
);
kernel
::
LiteKernel
*
kernel
;
auto
filter_quant_size
=
inputs
[
kWeightIndex
]
->
GetQuantParams
().
size
();
if
(
filter_quant_size
==
1
)
{
// per tensor
kernel
=
new
(
std
::
nothrow
)
kernel
::
ConvolutionDepthwiseInt8CPUKernel
(
opParameter
,
inputs
,
outputs
,
ctx
,
primitive
);
}
else
{
// per channel
kernel
=
new
(
std
::
nothrow
)
kernel
::
ConvolutionDepthwiseSWInt8CPUKernel
(
opParameter
,
inputs
,
outputs
,
ctx
,
primitive
);
}
if
(
kernel
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"kernel is nullptr."
;
return
nullptr
;
...
...
mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.h
浏览文件 @
8d06c2b8
...
...
@@ -36,15 +36,14 @@ class ConvolutionDepthwiseInt8CPUKernel : public ConvolutionBaseCPUKernel {
int
Run
()
override
;
int
InitWeightBias
();
int
InitBuffer
();
int
Execute
(
int
task_id
);
private:
SlidingWindowParam
*
sliding
=
nullptr
;
int
InitBuffer
()
;
int16_t
*
packed_weight_
=
nullptr
;
int
16_t
*
packed_input
_
=
nullptr
;
int8_t
*
packed_output
_
=
nullptr
;
bool
need_align_
=
false
;
int
8_t
*
input_ptr
_
=
nullptr
;
int8_t
*
output_ptr
_
=
nullptr
;
int32_t
*
row_buffer_
=
nullptr
;
};
}
// namespace mindspore::kernel
...
...
mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.cc
0 → 100644
浏览文件 @
8d06c2b8
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h"
#include "schema/model_generated.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "nnacl/int8/conv_depthwise_int8.h"
#include "src/runtime/runtime_api.h"
using
mindspore
::
kernel
::
KERNEL_ARCH
::
kCPU
;
using
mindspore
::
lite
::
KernelRegistrar
;
using
mindspore
::
lite
::
RET_ERROR
;
using
mindspore
::
lite
::
RET_OK
;
using
mindspore
::
schema
::
PrimitiveType_DepthwiseConv2D
;
namespace
mindspore
::
kernel
{
// Releases the sliding-window parameters, the packed weights and the quantization
// parameters owned by this kernel.
ConvolutionDepthwiseSWInt8CPUKernel::~ConvolutionDepthwiseSWInt8CPUKernel() {
  // delete and free are no-ops on null pointers, so no explicit checks are needed.
  delete sliding;
  sliding = nullptr;

  free(packed_weight_);
  packed_weight_ = nullptr;

  FreeQuantParam();
}
int
ConvolutionDepthwiseSWInt8CPUKernel
::
InitWeightBias
()
{
// init weight, int8 -> int16
// o, h, w, i -> o/8, h, w, i, 8; o == group, i == 1
auto
weight_tensor
=
in_tensors_
[
kWeightIndex
];
auto
origin_weight
=
reinterpret_cast
<
int8_t
*>
(
weight_tensor
->
Data
());
int
OC4
=
UP_DIV
(
weight_tensor
->
Batch
(),
C4NUM
);
int
pack_weight_size
=
C4NUM
*
OC4
*
weight_tensor
->
Height
()
*
weight_tensor
->
Width
();
packed_weight_
=
reinterpret_cast
<
int16_t
*>
(
malloc
(
pack_weight_size
*
sizeof
(
int16_t
)));
if
(
packed_weight_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
PackDepthwiseInt8Weight
(
origin_weight
,
packed_weight_
,
weight_tensor
->
Height
()
*
weight_tensor
->
Width
(),
weight_tensor
->
Batch
(),
&
(
conv_param_
->
conv_quant_arg_
));
bias_data_
=
reinterpret_cast
<
int32_t
*>
(
malloc
(
C4NUM
*
OC4
*
sizeof
(
int32_t
)));
if
(
bias_data_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
memset
(
bias_data_
,
0
,
C4NUM
*
OC4
*
sizeof
(
int32_t
));
if
(
in_tensors_
.
size
()
==
kInputSize2
)
{
auto
bias_tensor
=
in_tensors_
.
at
(
kBiasIndex
);
auto
ori_bias
=
reinterpret_cast
<
int32_t
*>
(
bias_tensor
->
Data
());
memcpy
(
bias_data_
,
ori_bias
,
bias_tensor
->
ElementsNum
()
*
sizeof
(
int32_t
));
}
conv_param_
->
thread_num_
=
MSMIN
(
thread_count_
,
OC4
);
return
RET_OK
;
}
int
ConvolutionDepthwiseSWInt8CPUKernel
::
InitBuffer
()
{
int
pack_input_size
=
conv_param_
->
input_batch_
*
conv_param_
->
input_h_
*
conv_param_
->
input_w_
*
C4NUM
*
UP_DIV
(
conv_param_
->
input_channel_
,
4
);
packed_input_
=
reinterpret_cast
<
int16_t
*>
(
context_
->
allocator
->
Malloc
(
pack_input_size
*
sizeof
(
int16_t
)));
if
(
packed_input_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
if
(
conv_param_
->
input_channel_
%
C4NUM
!=
0
)
{
need_align_
=
true
;
int
pack_output_size
=
conv_param_
->
output_batch_
*
conv_param_
->
output_h_
*
conv_param_
->
output_w_
*
C4NUM
*
UP_DIV
(
conv_param_
->
output_channel_
,
C4NUM
);
packed_output_
=
reinterpret_cast
<
int8_t
*>
(
context_
->
allocator
->
Malloc
(
pack_output_size
*
sizeof
(
int8_t
)));
if
(
packed_input_
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"Malloc buffer failed."
;
return
RET_ERROR
;
}
}
return
RET_OK
;
}
int
ConvolutionDepthwiseSWInt8CPUKernel
::
Init
()
{
sliding
=
new
(
std
::
nothrow
)
SlidingWindowParam
;
if
(
sliding
==
nullptr
)
{
MS_LOG
(
ERROR
)
<<
"new sliding window param."
;
return
RET_ERROR
;
}
if
(
!
InferShapeDone
())
{
return
RET_OK
;
}
return
ReSize
();
}
int
ConvolutionDepthwiseSWInt8CPUKernel
::
ReSize
()
{
ConvolutionBaseCPUKernel
::
Init
();
InitSlidingParamConvDw
(
sliding
,
conv_param_
,
C4NUM
);
auto
ret
=
ConvolutionBaseCPUKernel
::
SetQuantParam
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Set quant param failed."
;
return
ret
;
}
ret
=
InitWeightBias
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Depthwise int8 InitWeightBias error!"
;
return
ret
;
}
return
RET_OK
;
}
// Runs the sliding-window depthwise convolution for one task on the packed buffers.
// Always returns RET_OK.
int ConvolutionDepthwiseSWInt8CPUKernel::Execute(int task_id) {
  auto *bias = reinterpret_cast<int32_t *>(bias_data_);
  ConvDwSWInt8(packed_output_, packed_input_, packed_weight_, bias, conv_param_, sliding, task_id);
  return RET_OK;
}
int
ConvDwSWInt8Run
(
void
*
cdata
,
int
task_id
)
{
auto
conv_dw_int8
=
reinterpret_cast
<
ConvolutionDepthwiseSWInt8CPUKernel
*>
(
cdata
);
auto
ret
=
conv_dw_int8
->
Execute
(
task_id
);
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"ConvolutionDepthwiseSWInt8Run error task_id["
<<
task_id
<<
"] error_code["
<<
ret
<<
"]"
;
return
RET_ERROR
;
}
return
RET_OK
;
}
int
ConvolutionDepthwiseSWInt8CPUKernel
::
Run
()
{
if
(
conv_param_
->
input_channel_
!=
conv_param_
->
output_channel_
)
{
MS_LOG
(
ERROR
)
<<
"Only support input channel equals output channel."
;
return
RET_ERROR
;
}
auto
ret
=
Prepare
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Prepare failed."
;
return
RET_ERROR
;
}
ret
=
InitBuffer
();
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"Depthwise int8 ReSize error!"
;
return
ret
;
}
auto
input_tensor
=
in_tensors_
.
at
(
kInputIndex
);
auto
input_addr
=
reinterpret_cast
<
int8_t
*>
(
input_tensor
->
Data
());
PackDepthwiseInt8Input
(
input_addr
,
packed_input_
,
conv_param_
);
auto
output_addr
=
reinterpret_cast
<
int8_t
*>
(
out_tensors_
.
at
(
kOutputIndex
)
->
Data
());
if
(
!
need_align_
)
{
packed_output_
=
output_addr
;
}
ret
=
ParallelLaunch
(
THREAD_POOL_DEFAULT
,
ConvDwSWInt8Run
,
this
,
conv_param_
->
thread_num_
);
if
(
ret
!=
RET_OK
)
{
MS_LOG
(
ERROR
)
<<
"ConvDwSWInt8Run error: error_code["
<<
ret
<<
"]"
;
return
RET_ERROR
;
}
if
(
need_align_
)
{
PackNHWC4ToNHWCInt8
(
packed_output_
,
output_addr
,
conv_param_
->
output_batch_
,
conv_param_
->
output_h_
*
conv_param_
->
output_w_
,
conv_param_
->
output_channel_
);
context_
->
allocator
->
Free
(
packed_output_
);
}
context_
->
allocator
->
Free
(
packed_input_
);
return
RET_OK
;
}
}
// namespace mindspore::kernel
mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_slidewindow_int8.h
0 → 100644
浏览文件 @
8d06c2b8
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_
#include <vector>
#include "src/lite_kernel.h"
#include "src/runtime/kernel/arm/base/convolution_base.h"
#include "nnacl/fp32/conv_depthwise.h"
namespace
mindspore
::
kernel
{
// CPU kernel for int8 depthwise convolution using the sliding-window implementation
// (ConvDwSWInt8). The kernel creator selects it when the filter has more than one
// quantization parameter (per-channel quantization).
class ConvolutionDepthwiseSWInt8CPUKernel : public ConvolutionBaseCPUKernel {
 public:
  ConvolutionDepthwiseSWInt8CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                      const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx,
                                      const mindspore::lite::PrimitiveC *primitive)
      : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {}
  // Frees sliding, packed_weight_ and the quantization parameters.
  ~ConvolutionDepthwiseSWInt8CPUKernel() override;

  // Allocates `sliding`; calls ReSize() if shape inference has already completed.
  int Init() override;
  // Recomputes sliding-window and quantization parameters and repacks weights and bias.
  int ReSize() override;
  // Packs the input, launches Execute() across tasks and unpacks the output if needed.
  int Run() override;

  // Packs int8 weights to int16 and builds the zero-padded int32 bias buffer.
  int InitWeightBias();
  // Allocates the per-run packed input (and, if needed, packed output) buffers.
  int InitBuffer();
  // Runs the convolution for a single task.
  int Execute(int task_id);

 private:
  // Sliding-window geometry; allocated in Init(), deleted in the destructor.
  SlidingWindowParam *sliding = nullptr;
  // int16 packed weights; malloc'ed in InitWeightBias(), freed in the destructor.
  int16_t *packed_weight_ = nullptr;
  // int16 packed input; allocated from the context allocator for each Run().
  int16_t *packed_input_ = nullptr;
  // Packed int8 output: a scratch buffer when need_align_ is true, otherwise the output
  // tensor's own data.
  int8_t *packed_output_ = nullptr;
  // True when the channel count is not a multiple of C4NUM.
  bool need_align_ = false;
};
}
// namespace mindspore::kernel
#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_DEPTHWISE_SLIDEWINDOW_INT8_H_
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录