PaddlePaddle / Paddle-Lite
Commit f830e839, authored Oct 23, 2018 by hjchen2

Merge branch 'develop' of https://github.com/PaddlePaddle/paddle-mobile into dev-latest

Parents: 022f1291, 4ad46778
Showing 16 changed files with 918 additions and 371 deletions (+918 -371)
src/common/types.cpp  +2 -0
src/fpga/bias_scale.cpp  +0 -3
src/framework/load_ops.h  +3 -0
src/operators/kernel/arm/polygon_box_transform_kernel.cpp  +38 -0
src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h  +53 -0
src/operators/kernel/polygon_box_transform_kernel.h  +36 -0
src/operators/math/gemm.cpp  +1 -1
src/operators/math/gemm.h  +6 -0
src/operators/math/gemm_int8.cpp  +517 -366
src/operators/op_param.h  +26 -0
src/operators/polygon_box_transform_op.cpp  +45 -0
src/operators/polygon_box_transform_op.h  +56 -0
test/CMakeLists.txt  +4 -0
test/operators/test_mul_op.cpp  +0 -1
test/operators/test_polygon_box_transform_op.cpp  +126 -0
tools/op.cmake  +5 -0
src/common/types.cpp

@@ -34,6 +34,7 @@ const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
 const char *G_OP_TYPE_LRN = "lrn";
 const char *G_OP_TYPE_MUL = "mul";
 const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
+const char *G_OP_TYPE_POLYGON_BOX_TRANSFORM = "polygon_box_transform";
 const char *G_OP_TYPE_POOL2D = "pool2d";
 const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
 const char *G_OP_TYPE_RELU = "relu";

@@ -94,6 +95,7 @@ std::unordered_map<
     {G_OP_TYPE_FUSION_CONV_BN_ADD_RELU, {{"Input"}, {"Out"}}},
     {G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}},
     {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
+    {G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}},
     {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
     {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
     {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
src/fpga/bias_scale.cpp

@@ -27,9 +27,6 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
       (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
   int num_per_div_after_alignment =
       align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);
-  if (num_per_div_before_alignment == num_per_div_after_alignment) {
-    return;
-  }
   int num_element =
       2 * div_num * num_per_div_after_alignment;  // including bias & scale
   float *ptr_aligned =
src/framework/load_ops.h

@@ -199,6 +199,9 @@ LOAD_OP3(pool2d, CPU, MALI_GPU, FPGA);
 #ifdef MULTICLASSNMS_OP
 LOAD_OP1(multiclass_nms, CPU);
 #endif
+#ifdef POLYGONBOXTRANSFORM_OP
+LOAD_OP1(polygon_box_transform, CPU);
+#endif
 #ifdef SUM_OP
 LOAD_OP1(sum, CPU);
 #endif
src/operators/kernel/arm/polygon_box_transform_kernel.cpp (new file, mode 100644)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POLYGONBOXTRANSFORM_OP

#include "operators/kernel/polygon_box_transform_kernel.h"
#include "operators/kernel/central-arm-func/polygon_box_transform_arm_func.h"

namespace paddle_mobile {
namespace operators {

template <>
bool PolygonBoxTransformKernel<CPU, float>::Init(
    PolygonBoxTransformParam<CPU> *param) {
  return true;
}

template <>
void PolygonBoxTransformKernel<CPU, float>::Compute(
    const PolygonBoxTransformParam<CPU> &param) const {
  PolygonBoxTransformCompute<float>(param);
}

}  // namespace operators
}  // namespace paddle_mobile

#endif
src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h (new file, mode 100644)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POLYGONBOXTRANSFORM_OP

#pragma once

#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

template <typename P>
void PolygonBoxTransformCompute(const PolygonBoxTransformParam<CPU>& param) {
  const auto* input = param.Input();
  const auto& input_dims = input->dims();
  const auto* input_data = input->data<float>();
  auto* output = param.Output();
  auto* output_data = output->mutable_data<float>();

  int64_t batch_size = input_dims[0];
  int64_t geo_channel = input_dims[1];
  int64_t height = input_dims[2];
  int64_t width = input_dims[3];
  int64_t id = 0;
  for (int64_t id_n = 0; id_n < batch_size * geo_channel; ++id_n) {
    for (int64_t id_h = 0; id_h < height; ++id_h) {
      for (int64_t id_w = 0; id_w < width; ++id_w) {
        id = id_n * height * width + width * id_h + id_w;
        if (id_n % 2 == 0) {
          output_data[id] = id_w * 4 - input_data[id];
        } else {
          output_data[id] = id_h * 4 - input_data[id];
        }
      }
    }
  }
}

}  // namespace operators
}  // namespace paddle_mobile

#endif
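Read together, the loop writes `id_w * 4 - v` on even geometry channels and `id_h * 4 - v` on odd ones. The standalone sketch below reproduces that arithmetic on plain arrays with a hypothetical 1x2x1x2 shape (an illustration only, not code from this commit):

#include <cstdio>

// Minimal sketch of the polygon_box_transform arithmetic above, on plain
// arrays instead of the framework's tensors. Shape is batch=1,
// geo_channel=2, height=1, width=2 (hypothetical values).
int main() {
  const int batch = 1, channel = 2, height = 1, width = 2;
  float in[4] = {0.f, 1.f, 2.f, 3.f};
  float out[4];
  for (int n = 0; n < batch * channel; ++n) {
    for (int h = 0; h < height; ++h) {
      for (int w = 0; w < width; ++w) {
        int id = n * height * width + width * h + w;
        // even channels use the column index, odd channels the row index
        // (times 4), matching the kernel's branch above
        out[id] = (n % 2 == 0) ? w * 4 - in[id] : h * 4 - in[id];
      }
    }
  }
  for (int i = 0; i < 4; ++i) printf("%g ", out[i]);  // prints: 0 3 -2 -3
  return 0;
}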
src/operators/kernel/polygon_box_transform_kernel.h (new file, mode 100644)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POLYGONBOXTRANSFORM_OP

#pragma once

#include "framework/operator.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

template <typename DeviceType, typename T>
class PolygonBoxTransformKernel
    : public framework::OpKernelBase<DeviceType,
                                     PolygonBoxTransformParam<DeviceType>> {
 public:
  void Compute(const PolygonBoxTransformParam<DeviceType>& param) const;
  bool Init(PolygonBoxTransformParam<DeviceType>* param);
};

}  // namespace operators
}  // namespace paddle_mobile

#endif
src/operators/math/gemm.cpp

@@ -3379,7 +3379,7 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
   // 对 B 分块
   NC = L1 / (KC * sizeof(float));
   if (NC == 0) {
-    NC == NR;
+    NC = NR;
   } else {
     int nblock_num = (n + NC - 1) / NC;
     NC = (n + nblock_num - 1) / nblock_num;
src/operators/math/gemm.h

@@ -22,9 +22,11 @@ limitations under the License. */
 #define C(i, j) C[(i)*ldc + (j)]

 #if __aarch64__
+#define MR_INT8 4
 #define MR 6
 #define NR 16
 #else
+#define MR_INT8 4
 #define MR 6
 #define NR 8
 #endif

@@ -189,6 +191,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
   // 8 bits function cluster begins
   // 8 bits int small block inner product
+  void AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
+                 int32_t ldc);
   void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
                  int32_t ldc);

@@ -199,6 +203,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                  int8_t *bias);

   // 8 bits int pack function
+  void PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
+                      int32_t lda, int8_t *buffer);
   void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
                       int32_t lda, int8_t *buffer);
   void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
src/operators/math/gemm_int8.cpp

@@ -26,11 +26,228 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 namespace math {

+void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
+                     int32_t ldc) {
+#if __ARM_NEON
+#if __aarch64__
+// TODO
+#else
+  const int8_t *a_ptr, *b_ptr;
+  a_ptr = a;
+  b_ptr = b;
+  int32_t kc1 = k >> 3;
+  int32_t kc2 = k & 7;
+  int32_t kc3 = kc2 >> 2;
+  int32_t kc4 = kc2 & 3;
+  int32_t kc5 = kc4 >> 1;
+  int32_t kc6 = kc4 & 1;
+  int32_t step = sizeof(int32_t) * ldc;
+  asm volatile(
+      // q8-q15: save 32 results
+      "pld [%[a_ptr]]                  \n\t"
+      "pld [%[b_ptr]]                  \n\t"
+      "pld [%[b_ptr], #64]             \n\t"
+      "vmov.s32 q8, #0                 \n\t"
+      "vmov.s32 q9, q8                 \n\t"
+      "vmov.s32 q10, q8                \n\t"
+      "vmov.s32 q11, q8                \n\t"
+      "vmov.s32 q12, q8                \n\t"
+      "vmov.s32 q13, q8                \n\t"
+      "vmov.s32 q14, q8                \n\t"
+      "vmov.s32 q15, q8                \n\t"
+
+      "subs %[kc1], %[kc1], #1         \n\t"
+      "blt 1f                          \n\t"
+      "0:                              \n\t"
+      "pld [%[a_ptr], #64]             \n\t"
+      "pld [%[b_ptr], #128]            \n\t"
+      "vld1.s8 {d0-d3}, [%[a_ptr]]!    \n\t"  // load A 8 cols
+      "vld1.s8 {d8-d11}, [%[b_ptr]]!   \n\t"  // load B first 4 rows
+      "vmovl.s8 q2, d0                 \n\t"  // process B first 4 rows
+      "vmovl.s8 q3, d8                 \n\t"
+      "vmlal.s16 q8, d6, d4[0]         \n\t"
+      "vmlal.s16 q9, d7, d4[0]         \n\t"
+      "vmlal.s16 q10, d6, d4[1]        \n\t"
+      "vmlal.s16 q11, d7, d4[1]        \n\t"
+      "vmlal.s16 q12, d6, d4[2]        \n\t"
+      "vmlal.s16 q13, d7, d4[2]        \n\t"
+      "vmlal.s16 q14, d6, d4[3]        \n\t"
+      "vmlal.s16 q15, d7, d4[3]        \n\t"
+      "vmovl.s8 q3, d9                 \n\t"
+      "vmlal.s16 q8, d6, d5[0]         \n\t"
+      "vmlal.s16 q9, d7, d5[0]         \n\t"
+      "vmlal.s16 q10, d6, d5[1]        \n\t"
+      "vmlal.s16 q11, d7, d5[1]        \n\t"
+      "vmlal.s16 q12, d6, d5[2]        \n\t"
+      "vmlal.s16 q13, d7, d5[2]        \n\t"
+      "vmlal.s16 q14, d6, d5[3]        \n\t"
+      "vmlal.s16 q15, d7, d5[3]        \n\t"
+      "vld1.s8 {d12-d15}, [%[b_ptr]]!  \n\t"  // load B second 4 rows
+      "vmovl.s8 q2, d1                 \n\t"
+      "vmovl.s8 q3, d10                \n\t"
+      "vmlal.s16 q8, d6, d4[0]         \n\t"
+      "vmlal.s16 q9, d7, d4[0]         \n\t"
+      "vmlal.s16 q10, d6, d4[1]        \n\t"
+      "vmlal.s16 q11, d7, d4[1]        \n\t"
+      "vmlal.s16 q12, d6, d4[2]        \n\t"
+      "vmlal.s16 q13, d7, d4[2]        \n\t"
+      "vmlal.s16 q14, d6, d4[3]        \n\t"
+      "vmlal.s16 q15, d7, d4[3]        \n\t"
+      "vmovl.s8 q3, d11                \n\t"
+      "vmlal.s16 q8, d6, d5[0]         \n\t"
+      "vmlal.s16 q9, d7, d5[0]         \n\t"
+      "vmlal.s16 q10, d6, d5[1]        \n\t"
+      "vmlal.s16 q11, d7, d5[1]        \n\t"
+      "vmlal.s16 q12, d6, d5[2]        \n\t"
+      "vmlal.s16 q13, d7, d5[2]        \n\t"
+      "vmlal.s16 q14, d6, d5[3]        \n\t"
+      "vmlal.s16 q15, d7, d5[3]        \n\t"
+      "vmovl.s8 q2, d2                 \n\t"  // process B second 4 rows
+      "vmovl.s8 q3, d12                \n\t"
+      "vmlal.s16 q8, d6, d4[0]         \n\t"
+      "vmlal.s16 q9, d7, d4[0]         \n\t"
+      "vmlal.s16 q10, d6, d4[1]        \n\t"
+      "vmlal.s16 q11, d7, d4[1]        \n\t"
+      "vmlal.s16 q12, d6, d4[2]        \n\t"
+      "vmlal.s16 q13, d7, d4[2]        \n\t"
+      "vmlal.s16 q14, d6, d4[3]        \n\t"
+      "vmlal.s16 q15, d7, d4[3]        \n\t"
+      "vmovl.s8 q3, d13                \n\t"
+      "vmlal.s16 q8, d6, d5[0]         \n\t"
+      "vmlal.s16 q9, d7, d5[0]         \n\t"
+      "vmlal.s16 q10, d6, d5[1]        \n\t"
+      "vmlal.s16 q11, d7, d5[1]        \n\t"
+      "vmlal.s16 q12, d6, d5[2]        \n\t"
+      "vmlal.s16 q13, d7, d5[2]        \n\t"
+      "vmlal.s16 q14, d6, d5[3]        \n\t"
+      "vmlal.s16 q15, d7, d5[3]        \n\t"
+      "vmovl.s8 q2, d3                 \n\t"
+      "vmovl.s8 q3, d14                \n\t"
+      "vmlal.s16 q8, d6, d4[0]         \n\t"
+      "vmlal.s16 q9, d7, d4[0]         \n\t"
+      "vmlal.s16 q10, d6, d4[1]        \n\t"
+      "vmlal.s16 q11, d7, d4[1]        \n\t"
+      "vmlal.s16 q12, d6, d4[2]        \n\t"
+      "vmlal.s16 q13, d7, d4[2]        \n\t"
+      "vmlal.s16 q14, d6, d4[3]        \n\t"
+      "vmlal.s16 q15, d7, d4[3]        \n\t"
+      "vmovl.s8 q3, d15                \n\t"
+      "vmlal.s16 q8, d6, d5[0]         \n\t"
+      "vmlal.s16 q9, d7, d5[0]         \n\t"
+      "vmlal.s16 q10, d6, d5[1]        \n\t"
+      "vmlal.s16 q11, d7, d5[1]        \n\t"
+      "vmlal.s16 q12, d6, d5[2]        \n\t"
+      "vmlal.s16 q13, d7, d5[2]        \n\t"
+      "vmlal.s16 q14, d6, d5[3]        \n\t"
+      "vmlal.s16 q15, d7, d5[3]        \n\t"
+      "subs %[kc1], %[kc1], #1         \n\t"
+      "bge 0b                          \n\t"
+
+      "1:                              \n\t"  // last 4 rows
+      "subs %[kc3], %[kc3], #1         \n\t"
+      "blt 2f                          \n\t"
+      "vld1.s8 {d0-d1}, [%[a_ptr]]!    \n\t"  // load A 4 cols
+      "vld1.s8 {d8-d11}, [%[b_ptr]]!   \n\t"  // load B 4 rows
+      "vmovl.s8 q2, d0                 \n\t"
+      "vmovl.s8 q3, d8                 \n\t"
+      "vmlal.s16 q8, d6, d4[0]         \n\t"
+      "vmlal.s16 q9, d7, d4[0]         \n\t"
+      "vmlal.s16 q10, d6, d4[1]        \n\t"
+      "vmlal.s16 q11, d7, d4[1]        \n\t"
+      "vmlal.s16 q12, d6, d4[2]        \n\t"
+      "vmlal.s16 q13, d7, d4[2]        \n\t"
+      "vmlal.s16 q14, d6, d4[3]        \n\t"
+      "vmlal.s16 q15, d7, d4[3]        \n\t"
+      "vmovl.s8 q3, d9                 \n\t"
+      "vmlal.s16 q8, d6, d5[0]         \n\t"
+      "vmlal.s16 q9, d7, d5[0]         \n\t"
+      "vmlal.s16 q10, d6, d5[1]        \n\t"
+      "vmlal.s16 q11, d7, d5[1]        \n\t"
+      "vmlal.s16 q12, d6, d5[2]        \n\t"
+      "vmlal.s16 q13, d7, d5[2]        \n\t"
+      "vmlal.s16 q14, d6, d5[3]        \n\t"
+      "vmlal.s16 q15, d7, d5[3]        \n\t"
+      "vmovl.s8 q2, d1                 \n\t"
+      "vmovl.s8 q3, d10                \n\t"
+      "vmlal.s16 q8, d6, d4[0]         \n\t"
+      "vmlal.s16 q9, d7, d4[0]         \n\t"
+      "vmlal.s16 q10, d6, d4[1]        \n\t"
+      "vmlal.s16 q11, d7, d4[1]        \n\t"
+      "vmlal.s16 q12, d6, d4[2]        \n\t"
+      "vmlal.s16 q13, d7, d4[2]        \n\t"
+      "vmlal.s16 q14, d6, d4[3]        \n\t"
+      "vmlal.s16 q15, d7, d4[3]        \n\t"
+      "vmovl.s8 q3, d11                \n\t"
+      "vmlal.s16 q8, d6, d5[0]         \n\t"
+      "vmlal.s16 q9, d7, d5[0]         \n\t"
+      "vmlal.s16 q10, d6, d5[1]        \n\t"
+      "vmlal.s16 q11, d7, d5[1]        \n\t"
+      "vmlal.s16 q12, d6, d5[2]        \n\t"
+      "vmlal.s16 q13, d7, d5[2]        \n\t"
+      "vmlal.s16 q14, d6, d5[3]        \n\t"
+      "vmlal.s16 q15, d7, d5[3]        \n\t"
+
+      "2:                              \n\t"  // last 2 rows
+      "subs %[kc5], %[kc5], #1         \n\t"
+      "blt 3f                          \n\t"
+      "vld1.s8 {d0}, [%[a_ptr]]!       \n\t"  // load A 2 cols
+      "vld1.s8 {d8-d9}, [%[b_ptr]]!    \n\t"  // load B 2 rows
+      "vmovl.s8 q2, d0                 \n\t"
+      "vmovl.s8 q3, d8                 \n\t"
+      "vmlal.s16 q8, d6, d4[0]         \n\t"
+      "vmlal.s16 q9, d7, d4[0]         \n\t"
+      "vmlal.s16 q10, d6, d4[1]        \n\t"
+      "vmlal.s16 q11, d7, d4[1]        \n\t"
+      "vmlal.s16 q12, d6, d4[2]        \n\t"
+      "vmlal.s16 q13, d7, d4[2]        \n\t"
+      "vmlal.s16 q14, d6, d4[3]        \n\t"
+      "vmlal.s16 q15, d7, d4[3]        \n\t"
+      "vmovl.s8 q3, d9                 \n\t"
+      "vmlal.s16 q8, d6, d5[0]         \n\t"
+      "vmlal.s16 q9, d7, d5[0]         \n\t"
+      "vmlal.s16 q10, d6, d5[1]        \n\t"
+      "vmlal.s16 q11, d7, d5[1]        \n\t"
+      "vmlal.s16 q12, d6, d5[2]        \n\t"
+      "vmlal.s16 q13, d7, d5[2]        \n\t"
+      "vmlal.s16 q14, d6, d5[3]        \n\t"
+      "vmlal.s16 q15, d7, d5[3]        \n\t"
+
+      "3:                              \n\t"  // last 1 row
+      "subs %[kc6], %[kc6], #1         \n\t"
+      "blt 4f                          \n\t"
+      "vld1.s8 {d0}, [%[a_ptr]]        \n\t"  // load A 1 col
+      "vld1.s8 {d8}, [%[b_ptr]]        \n\t"  // load B 1 row
+      "vmovl.s8 q2, d0                 \n\t"
+      "vmovl.s8 q3, d8                 \n\t"
+      "vmlal.s16 q8, d6, d4[0]         \n\t"
+      "vmlal.s16 q9, d7, d4[0]         \n\t"
+      "vmlal.s16 q10, d6, d4[1]        \n\t"
+      "vmlal.s16 q11, d7, d4[1]        \n\t"
+      "vmlal.s16 q12, d6, d4[2]        \n\t"
+      "vmlal.s16 q13, d7, d4[2]        \n\t"
+      "vmlal.s16 q14, d6, d4[3]        \n\t"
+      "vmlal.s16 q15, d7, d4[3]        \n\t"
+
+      "4:                                  \n\t"
+      "vst1.32 {q8, q9}, [%[c]], %[step]   \n\t"
+      "vst1.32 {q10, q11}, [%[c]], %[step] \n\t"
+      "vst1.32 {q12, q13}, [%[c]], %[step] \n\t"
+      "vst1.32 {q14, q15}, [%[c]]          \n\t"
+      :
+      : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
+        [kc3] "r"(kc3), [kc5] "r"(kc5), [kc6] "r"(kc6), [step] "r"(step)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+#endif  // __aarch64__
+#endif  // __ARM_NEON
+}
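Functionally, the block above produces a 4x8 tile of int32 results from packed int8 inputs, zero-initializing the accumulators and overwriting c. A scalar reference sketch of the same contract (hypothetical helper, not code from this commit), assuming a holds four packed A values per k-step and b eight packed B values per k-step:

#include <cstdint>

// Scalar reference for the 4x8 int8 micro-kernel above (hypothetical name
// and code; the commit's version is the NEON asm).
void AddDot4x8_ref(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
                   int32_t ldc) {
  for (int32_t m = 0; m < 4; ++m) {
    for (int32_t n = 0; n < 8; ++n) {
      int32_t acc = 0;
      for (int32_t p = 0; p < k; ++p) {
        // widen before multiplying, as the vmovl.s8/vmlal.s16 pairing does
        acc += static_cast<int32_t>(a[p * 4 + m]) *
               static_cast<int32_t>(b[p * 8 + n]);
      }
      c[m * ldc + n] = acc;  // overwrite, matching the vst1.32 stores
    }
  }
}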
 // 8 bits int small block inner product
 void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
                      int32_t ldc) {
 #if __ARM_NEON
+#if __aarch64__
+// TODO
+#else
   const int8_t *a_ptr, *b_ptr;
   a_ptr = a;
   b_ptr = b;
@@ -46,383 +263,265 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
       "pld [%[a_ptr]]                  \n\t"
       "pld [%[b_ptr]]                  \n\t"
       "pld [%[b_ptr], #64]             \n\t"
-      "vmov.s8 q4, #0                  \n\t"
-      "vmov.s8 q5, #0                  \n\t"
-      "vmov.s8 q6, #0                  \n\t"
-      "vmov.s8 q7, #0                  \n\t"
-      "vmov.s8 q8, #0                  \n\t"
-      "vmov.s8 q9, #0                  \n\t"
-      "vmov.s8 q10, #0                 \n\t"
-      "vmov.s8 q11, #0                 \n\t"
-      "vmov.s8 q12, #0                 \n\t"
-      "vmov.s8 q13, #0                 \n\t"
-      "vmov.s8 q14, #0                 \n\t"
-      "vmov.s8 q15, #0                 \n\t"
+      "vmov.s32 q4, #0                 \n\t"
+      "vmov.s32 q5, q4                 \n\t"
+      "vmov.s32 q6, q4                 \n\t"
+      "vmov.s32 q7, q4                 \n\t"
+      "vmov.s32 q8, q4                 \n\t"
+      "vmov.s32 q9, q4                 \n\t"
+      "vmov.s32 q10, q4                \n\t"
+      "vmov.s32 q11, q4                \n\t"
+      "vmov.s32 q12, q4                \n\t"
+      "vmov.s32 q13, q4                \n\t"
+      "vmov.s32 q14, q4                \n\t"
+      "vmov.s32 q15, q4                \n\t"
       "mov r0, #12                     \n\t"
       "subs %[kc1], %[kc1], #1         \n\t"
       "blt 1f                          \n\t"
       "0:                              \n\t"
       "pld [%[a_ptr], #64]             \n\t"
       "pld [%[b_ptr], #128]            \n\t"

[remainder of this hunk: the main loop and the "last <8 rows" / "last <4 rows" / "last <2 rows" tails are rewritten in the same way, with the old vdup.s8 / vmlal.s8 / vaddw.s16 accumulation sequences replaced throughout by vmovl.s8 widening loads and vmlal.s16 multiply-accumulates, following the same pattern as AddDot4x8 above; the side-by-side capture of this region is interleaved beyond line-by-line reconstruction]

       "4:                                  \n\t"
       "vst1.32 {q4, q5}, [%[c]], %[step]   \n\t"
       "vst1.32 {q6, q7}, [%[c]], %[step]   \n\t"
@@ -435,7 +534,8 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
       [kc3] "r"(kc3), [kc5] "r"(kc5), [kc6] "r"(kc6), [step] "r"(step)
       : "cc", "memory", "r0", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
         "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
+#endif  // __aarch64__
 #endif  // __ARM_NEON
 }

 // 8 bits int inner product
@@ -445,8 +545,9 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
                                int8_t *bias) {
 #pragma omp parallel for
   for (int32_t j = 0; j < nc; j += NR) {
-    for (int32_t i = 0; i < mc; i += MR) {
-      AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+    for (int32_t i = 0; i < mc; i += MR_INT8) {
+      // AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
+      AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
     }
   }

   if (alpha != 1) {
@@ -474,12 +575,53 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
     return;
   }
 }

+// 8 bits int PackMatrixA_4r
+void Gemm::PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail,
+                          const int8_t *A, int32_t lda, int8_t *buffer) {
+  const int8_t *a0, *a1, *a2, *a3;
+  for (int32_t i = 0; i < m - m_tail; i += MR_INT8) {
+    a0 = A + i * lda;
+    a1 = A + (i + 1) * lda;
+    a2 = A + (i + 2) * lda;
+    a3 = A + (i + 3) * lda;
+    for (int32_t j = 0; j < k; ++j) {
+      *buffer++ = *a0++;
+      *buffer++ = *a1++;
+      *buffer++ = *a2++;
+      *buffer++ = *a3++;
+    }
+  }
+
+  if (m_tail != 0) {
+    a0 = &A(m - m_tail, 0);
+    a1 = a0 + lda;
+    a2 = a0 + 2 * lda;
+    a3 = a0 + 3 * lda;
+    switch (m_tail) {
+      case 1:
+        a1 = zero_int8;
+      case 2:
+        a2 = zero_int8;
+      case 3:
+        a3 = zero_int8;
+        break;
+      default:
+        break;
+    }
+    for (int j = 0; j < k; ++j) {
+      *buffer++ = *a0++;
+      *buffer++ = *a1++;
+      *buffer++ = *a2++;
+      *buffer++ = *a3++;
+    }
+  }
+}

-// 8 bits int PackMatrixA
+// 8 bits int PackMatrixA_6r
 void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
                           int32_t lda, int8_t *buffer) {
   const int32_t i_length = m - m_tail;
-  for (int32_t i = 0; i < i_length; i += MR) {
+  for (int32_t i = 0; i < i_length; i += MR_INT8) {
     const int8_t *a0 = A + i * lda;
     const int8_t *a1 = A + (i + 1) * lda;
     const int8_t *a2 = A + (i + 2) * lda;
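The packing order matters for the micro-kernel's sequential loads: each k-step contributes one value from each of four consecutive rows. A standalone sketch of the interleaved layout PackMatrixA_4r produces (hypothetical 4x3 matrix, no tail handling; not the commit's code):

#include <cstdint>
#include <cstdio>

// Sketch of the PackMatrixA_4r layout for a 4x3 int8 matrix (m = 4, k = 3,
// m_tail = 0, lda = 3): the packed buffer interleaves the four rows so the
// micro-kernel reads 4 A values per k-step with one sequential load.
int main() {
  const int32_t m = 4, k = 3, lda = 3;
  int8_t A[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};  // row-major
  int8_t packed[12];
  int8_t *buffer = packed;
  for (int32_t i = 0; i < m; i += 4) {
    const int8_t *a0 = A + i * lda, *a1 = a0 + lda, *a2 = a1 + lda,
                 *a3 = a2 + lda;
    for (int32_t j = 0; j < k; ++j) {
      *buffer++ = *a0++;
      *buffer++ = *a1++;
      *buffer++ = *a2++;
      *buffer++ = *a3++;
    }
  }
  for (int i = 0; i < 12; ++i) printf("%d ", packed[i]);
  // prints: 0 3 6 9 1 4 7 10 2 5 8 11
  return 0;
}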
@@ -539,6 +681,9 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
     for (int32_t i = 0; i < k; ++i) {
       const int8_t *b0 = &B(i, j);
 #if __ARM_NEON
+#if __aarch64__
+      // TODO
+#else
       asm volatile(
           //  "pld [%[b0]]                     \n\t"
           "vld1.s8 {d0}, [%[b0]]         \n\t"

@@ -546,6 +691,7 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
           : [local_buffer] "+r"(local_buffer)
           : [b0] "r"(b0)
           : "memory", "q0");
+#endif  // __aarch64__
 #else
       *local_buffer++ = *b0++;
       *local_buffer++ = *b0++;
       *local_buffer++ = *b0++;
       *local_buffer++ = *b0++;
@@ -585,13 +731,13 @@ void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
   MC = L1 / (KC * sizeof(int8_t));
   NC = L2 / (KC * sizeof(int8_t));

-  // make sure MC is multiple of MR, and NC is multiple of NR
+  // make sure MC is multiple of MR_INT8, and NC is multiple of NR
   if (MC == 0) {
-    MC = MR;
+    MC = MR_INT8;
   } else {
     int32_t mblock_num = (m + MC - 1) / MC;
     MC = (m + mblock_num - 1) / mblock_num;
-    MC = (MC + MR - 1) / MR * MR;
+    MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
   }
   // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
   if (NC == 0) {
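The MC computation above first splits m into roughly equal blocks and then rounds the block size up to a multiple of MR_INT8. A worked trace with hypothetical numbers (m = 100, initial MC = 48; illustrative values, not from the commit):

#include <cstdint>
#include <cstdio>

// Traces the MC blocking arithmetic from Sgemm above with hypothetical
// inputs: m = 100 rows and a cache-derived initial MC of 48.
int main() {
  const int32_t MR_INT8 = 4;  // as #defined in gemm.h by this commit
  int32_t m = 100, MC = 48;
  int32_t mblock_num = (m + MC - 1) / MC;           // 3 blocks
  MC = (m + mblock_num - 1) / mblock_num;           // 34 rows per block
  MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;      // rounded up to 36
  printf("mblock_num=%d MC=%d\n", mblock_num, MC);  // mblock_num=3 MC=36
  return 0;
}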
@@ -618,7 +764,8 @@ void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
     PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB_int8);
     for (int32_t i = 0; i < m; i += MC) {
       mc = s_min(m - i, MC);
-      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA_int8);
+      // PackMatrixA_6r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
+      PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
       if (bias == nullptr) {
         InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
                             packedC_int8, &C(i, j), ldc, relu, nullptr);
@@ -643,6 +790,9 @@ void Gemm::WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
 void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
                       int32_t ldc) {
 #if __ARM_NEON
+#if __aarch64__
+  // TODO
+#else
   int32_t nc1 = nc >> 4;
   int32_t _nc1 = nc & 15;
   int32_t step = sizeof(int32_t) * ldc;

@@ -696,6 +846,7 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
       }
     }
   }
+#endif  // __aarch64__
 #endif  // __ARM_NEON
 }
src/operators/op_param.h

@@ -999,6 +999,28 @@ class MultiClassNMSParam : public OpParam {
 };
 #endif

+#ifdef POLYGONBOXTRANSFORM_OP
+template <typename Dtype>
+class PolygonBoxTransformParam : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  PolygonBoxTransformParam(const VariableNameMap &inputs,
+                           const VariableNameMap &outputs,
+                           const AttributeMap &attrs, const Scope &scope) {
+    input_ = InputFrom<GType>(inputs, scope);
+    output_ = OutputFrom<GType>(outputs, scope);
+  }
+  const RType *Input() const { return input_; }
+  RType *Output() const { return output_; }
+
+ private:
+  RType *input_;
+  RType *output_;
+};
+#endif
+
 template <typename Dtype>
 class FeedParam : public OpParam {
   typedef typename DtypeTensorTrait<Dtype>::gtype GType;

@@ -2272,6 +2294,7 @@ class ShapeParam : public OpParam {
 };
 #endif

+#ifdef QUANT_OP
 template <typename Dtype>
 class QuantizeParam : public OpParam {
   typedef typename DtypeTensorTrait<Dtype>::gtype GType;

@@ -2311,7 +2334,9 @@ class QuantizeParam : public OpParam {
   // nearest_zero and nearest_even is valid currently
   RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
 };
+#endif

+#ifdef DEQUANT_OP
 template <typename Dtype>
 class DequantizeParam : public OpParam {
   typedef typename DtypeTensorTrait<Dtype>::gtype GType;

@@ -2339,6 +2364,7 @@ class DequantizeParam : public OpParam {
   RType *activation_scale_;
   float weight_scale_;
 };
+#endif

 }  // namespace operators
 }  // namespace paddle_mobile
src/operators/polygon_box_transform_op.cpp (new file, mode 100644)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POLYGONBOXTRANSFORM_OP

#include "operators/polygon_box_transform_op.h"

namespace paddle_mobile {
namespace operators {

template <typename Dtype, typename T>
void PolygonBoxTransformOp<Dtype, T>::InferShape() const {
  PADDLE_MOBILE_ENFORCE(this->param_.Input() != nullptr,
                        "Input (Input) of get_shape op should not be null.");
  PADDLE_MOBILE_ENFORCE(this->param_.Output() != nullptr,
                        "Output (Output) of get_shape op should not be null.");

  auto input_dims = this->param_.Input()->dims();

  PADDLE_MOBILE_ENFORCE(input_dims.size() == 4, "input's rank must be 4.");
  PADDLE_MOBILE_ENFORCE(input_dims[1] % 2 == 0,
                        "input's second dimension must be even.");

  this->param_.Output()->Resize(input_dims);
}

}  // namespace operators
}  // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(polygon_box_transform, ops::PolygonBoxTransformOp);
#endif

#endif
src/operators/polygon_box_transform_op.h (new file, mode 100644)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POLYGONBOXTRANSFORM_OP

#pragma once

#include <string>

#include "framework/operator.h"
#include "operators/kernel/polygon_box_transform_kernel.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

using paddle_mobile::framework::Tensor;

template <typename DeviceType, typename T>
class PolygonBoxTransformOp
    : public framework::OperatorWithKernel<
          DeviceType, PolygonBoxTransformParam<DeviceType>,
          operators::PolygonBoxTransformKernel<DeviceType, T>> {
 public:
  PolygonBoxTransformOp(const std::string &type, const VariableNameMap &inputs,
                        const VariableNameMap &outputs,
                        const framework::AttributeMap &attrs,
                        std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<
            DeviceType, PolygonBoxTransformParam<DeviceType>,
            operators::PolygonBoxTransformKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  using framework::OperatorWithKernel<
      DeviceType, PolygonBoxTransformParam<DeviceType>,
      operators::PolygonBoxTransformKernel<DeviceType, T>>::OperatorWithKernel;

  void InferShape() const override;

 protected:
};

}  // namespace operators
}  // namespace paddle_mobile

#endif
test/CMakeLists.txt

@@ -181,6 +181,10 @@ if (NOT FOUND_MATCH)
     ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h)
     target_link_libraries(test-multiclassnms-op paddle-mobile)

+    # gen test
+    ADD_EXECUTABLE(test-polygon-box-transform-op operators/test_polygon_box_transform_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-polygon-box-transform-op paddle-mobile)
+
     # gen test
     ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h)
     target_link_libraries(test-reshape-op paddle-mobile)
test/operators/test_mul_op.cpp

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <stdint-gcc.h>
 #include "../test_helper.h"
 #include "../test_include.h"
 #include "operators/mul_op.h"
test/operators/test_polygon_box_transform_op.cpp (new file, mode 100644)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include "../test_include.h"
#include "operators/polygon_box_transform_op.h"

namespace paddle_mobile {
namespace framework {

template <typename Dtype>
class TestPolygonBoxTransformOp {
 public:
  explicit TestPolygonBoxTransformOp(const Program<Dtype> p) : program_(p) {
    if (use_optimize_) {
      to_predict_program_ = program_.optimizeProgram;
    } else {
      to_predict_program_ = program_.originProgram;
    }

    const std::vector<std::shared_ptr<BlockDesc>> blocks =
        to_predict_program_->Blocks();
    for (auto block_desc : blocks) {
      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
      for (auto op : ops) {
        if (op->Type() == "polygon_box_transform") {
          DLOG << " attr size: " << op->GetAttrMap().size();
          DLOG << " inputs size: " << op->GetInputs().size();
          DLOG << " input is : " << op->Input("Input")[0];
          input_var_name = op->Input("Input")[0];
          DLOG << " outputs size: " << op->GetOutputs().size();
          DLOG << " output is : " << op->Output("Output")[0];
          output_var_name = op->Output("Output")[0];
          std::shared_ptr<operators::PolygonBoxTransformOp<Dtype, float>>
              op_ptr = std::make_shared<
                  operators::PolygonBoxTransformOp<Dtype, float>>(
                  op->Type(), op->GetInputs(), op->GetOutputs(),
                  op->GetAttrMap(), program_.scope);
          ops_of_block_[*block_desc.get()].push_back(op_ptr);
          return;
        }
      }
    }
  }

  std::shared_ptr<Tensor> predict(const Tensor &t) {
    auto scope = program_.scope;
    Variable *input_feed_value = scope->Var(input_var_name);
    auto tensor_input = input_feed_value->GetMutable<LoDTensor>();
    tensor_input->ShareDataWith(t);

    Variable *output = scope->Var(output_var_name);
    auto *output_tensor = output->GetMutable<LoDTensor>();

    std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
    out_tensor.reset(output_tensor);

    predict(t, 0);
    return out_tensor;
  }

 private:
  const framework::Program<Dtype> program_;
  std::shared_ptr<ProgramDesc> to_predict_program_;
  std::map<framework::BlockDesc,
           std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
      ops_of_block_;
  bool use_optimize_ = false;
  string input_var_name;
  string output_var_name;

  void predict(const Tensor &t, int block_id) {
    std::shared_ptr<BlockDesc> to_predict_block =
        to_predict_program_->Block(block_id);
    for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
      auto op = ops_of_block_[*to_predict_block.get()][j];
      op->Run();
    }
  }
};

template class TestPolygonBoxTransformOp<CPU>;
}  // namespace framework
}  // namespace paddle_mobile

int main() {
  DLOG << "----------**********----------";
  DLOG << "begin to run PolygonBoxTransform Test";
  paddle_mobile::Loader<paddle_mobile::CPU> loader;
  auto program = loader.Load(std::string(g_ocr));

  paddle_mobile::framework::Tensor input;
  SetupTensor<float>(&input, {1, 8, 1, 2}, static_cast<float>(0),
                     static_cast<float>(1));
  auto *input_ptr = input.data<float>();
  for (int i = 0; i < 16; ++i) {
    *(input_ptr + i) = i;
  }
  DLOG << "input : ";
  for (int i = 0; i < input.numel(); ++i) {
    DLOG << " index " << i << " : " << input_ptr[i];
  }

  paddle_mobile::framework::TestPolygonBoxTransformOp<paddle_mobile::CPU>
      testPolygonBoxTransformOp(program);

  auto output = testPolygonBoxTransformOp.predict(input);
  auto *output_ptr = output->data<float>();

  DLOG << "output : ";
  for (int i = 0; i < output->numel(); ++i) {
    DLOG << " index " << i << " : " << output_ptr[i];
  }
  return 0;
}
tools/op.cmake

@@ -195,6 +195,7 @@ if(NOT FOUND_MATCH)
     set(LRN_OP ON)
     set(MUL_OP ON)
     set(MULTICLASSNMS_OP ON)
+    set(POLYGONBOXTRANSFORM_OP ON)
     set(POOL_OP ON)
     set(PRIORBOX_OP ON)
     set(RELU_OP ON)

@@ -238,6 +239,7 @@ endif()
     # option(LRN_OP "" ON)
     # option(MUL_OP "" ON)
     # option(MULTICLASSNMS_OP "" ON)
+    # option(POLYGONBOXTRANSFORM_OP "" ON)
     # option(POOL_OP "" ON)
     # option(PRIORBOX_OP "" ON)
     # option(RELU_OP "" ON)

@@ -292,6 +294,9 @@ endif()
 if (MULTICLASSNMS_OP)
     add_definitions(-DMULTICLASSNMS_OP)
 endif()
+if (POLYGONBOXTRANSFORM_OP)
+    add_definitions(-DPOLYGONBOXTRANSFORM_OP)
+endif()
 if (POOL_OP)
     add_definitions(-DPOOL_OP)
 endif()