Commit f830e839 authored by hjchen2

Merge branch 'develop' of https://github.com/PaddlePaddle/paddle-mobile into dev-latest

@@ -34,6 +34,7 @@ const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
const char *G_OP_TYPE_LRN = "lrn";
const char *G_OP_TYPE_MUL = "mul";
const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
const char *G_OP_TYPE_POLYGON_BOX_TRANSFORM = "polygon_box_transform";
const char *G_OP_TYPE_POOL2D = "pool2d";
const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
const char *G_OP_TYPE_RELU = "relu";
@@ -94,6 +95,7 @@ std::unordered_map<
{G_OP_TYPE_FUSION_CONV_BN_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}},
{G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
{G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
{G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
...
@@ -27,9 +27,6 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);
if (num_per_div_before_alignment == num_per_div_after_alignment) {
return;
}
int num_element =
2 * div_num * num_per_div_after_alignment;  // including bias & scale
float *ptr_aligned =
...
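The align_to_x helper used in the hunk above is not part of this diff; a minimal sketch, assuming it is the usual round-up-to-multiple helper:

// Presumed behaviour of align_to_x: round num up to the next multiple of x.
inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }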
@@ -199,6 +199,9 @@ LOAD_OP3(pool2d, CPU, MALI_GPU, FPGA);
#ifdef MULTICLASSNMS_OP
LOAD_OP1(multiclass_nms, CPU);
#endif
#ifdef POLYGONBOXTRANSFORM_OP
LOAD_OP1(polygon_box_transform, CPU);
#endif
#ifdef SUM_OP
LOAD_OP1(sum, CPU);
#endif
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POLYGONBOXTRANSFORM_OP
#include "operators/kernel/polygon_box_transform_kernel.h"
#include "operators/kernel/central-arm-func/polygon_box_transform_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool PolygonBoxTransformKernel<CPU, float>::Init(
PolygonBoxTransformParam<CPU> *param) {
return true;
}
template <>
void PolygonBoxTransformKernel<CPU, float>::Compute(
const PolygonBoxTransformParam<CPU> &param) const {
PolygonBoxTransformCompute<float>(param);
}
} // namespace operators
} // namespace paddle_mobile
#endif
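Init returns true unconditionally because this kernel needs no precomputed state; Compute simply forwards to the shared ARM implementation in polygon_box_transform_arm_func.h, so a single code path serves the CPU build.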
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POLYGONBOXTRANSFORM_OP
#pragma once
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename P>
void PolygonBoxTransformCompute(const PolygonBoxTransformParam<CPU>& param) {
const auto* input = param.Input();
const auto& input_dims = input->dims();
const auto* input_data = input->data<float>();
auto* output = param.Output();
auto* output_data = output->mutable_data<float>();
int64_t batch_size = input_dims[0];
int64_t geo_channel = input_dims[1];
int64_t height = input_dims[2];
int64_t width = input_dims[3];
int64_t id = 0;
for (int64_t id_n = 0; id_n < batch_size * geo_channel; ++id_n) {
for (int64_t id_h = 0; id_h < height; ++id_h) {
for (int64_t id_w = 0; id_w < width; ++id_w) {
id = id_n * height * width + width * id_h + id_w;
if (id_n % 2 == 0) {
output_data[id] = id_w * 4 - input_data[id];
} else {
output_data[id] = id_h * 4 - input_data[id];
}
}
}
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
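A standalone sketch of what the kernel above computes, on a hypothetical 1x2x2x2 input with values 1..8: even geo channels are treated as x offsets against a 4-pixel grid, odd channels as y offsets.

#include <cstdint>
#include <cstdio>

// Minimal sketch of PolygonBoxTransformCompute on a 1x2x2x2 input with
// hypothetical values 1..8; prints: -1 2 -3 0 -5 -6 -3 -4
int main() {
  const int64_t n = 1, c = 2, h = 2, w = 2;
  float in[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  float out[8];
  for (int64_t id_n = 0; id_n < n * c; ++id_n)
    for (int64_t id_h = 0; id_h < h; ++id_h)
      for (int64_t id_w = 0; id_w < w; ++id_w) {
        const int64_t id = id_n * h * w + w * id_h + id_w;
        out[id] = (id_n % 2 == 0) ? id_w * 4 - in[id]   // even channel: x
                                  : id_h * 4 - in[id];  // odd channel: y
      }
  for (int i = 0; i < 8; ++i) printf("%g ", out[i]);
  printf("\n");
  return 0;
}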
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POLYGONBOXTRANSFORM_OP
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class PolygonBoxTransformKernel
: public framework::OpKernelBase<DeviceType,
PolygonBoxTransformParam<DeviceType>> {
public:
void Compute(const PolygonBoxTransformParam<DeviceType>& param) const;
bool Init(PolygonBoxTransformParam<DeviceType>* param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -3379,7 +3379,7 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
// partition B into blocks
NC = L1 / (KC * sizeof(float));
if (NC == 0) {
NC = NR;
} else {
int nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
...
@@ -22,9 +22,11 @@ limitations under the License. */
#define C(i, j) C[(i)*ldc + (j)]
#if __aarch64__
#define MR_INT8 4
#define MR 6
#define NR 16
#else
#define MR_INT8 4
#define MR 6
#define NR 8
#endif
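With these values the float path keeps its MR x NR tile (6x8 on 32-bit ARM, 6x16 on aarch64), while the int8 path introduced here works on a 4x8 tile: MR_INT8 rows of A against 8 columns of B, which is what AddDot4x8 computes.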
@@ -189,6 +191,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
// 8 bits function cluster begins
// 8 bits int small block inner product
void AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
@@ -199,6 +203,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
int8_t *bias);
// 8 bits int pack function
void PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
...
@@ -26,11 +26,228 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
namespace math {
void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
const int8_t *a_ptr, *b_ptr;
a_ptr = a;
b_ptr = b;
int32_t kc1 = k >> 3;
int32_t kc2 = k & 7;
int32_t kc3 = kc2 >> 2;
int32_t kc4 = kc2 & 3;
int32_t kc5 = kc4 >> 1;
int32_t kc6 = kc4 & 1;
int32_t step = sizeof(int32_t) * ldc;
asm volatile(
// q8-q15: save 32 results
"pld [%[a_ptr]] \n\t"
"pld [%[b_ptr]] \n\t"
"pld [%[b_ptr], #64] \n\t"
"vmov.s32 q8, #0 \n\t"
"vmov.s32 q9, q8 \n\t"
"vmov.s32 q10, q8 \n\t"
"vmov.s32 q11, q8 \n\t"
"vmov.s32 q12, q8 \n\t"
"vmov.s32 q13, q8 \n\t"
"vmov.s32 q14, q8 \n\t"
"vmov.s32 q15, q8 \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"blt 1f \n\t"
"0: \n\t"
"pld [%[a_ptr], #64] \n\t"
"pld [%[b_ptr], #128] \n\t"
"vld1.s8 {d0-d3}, [%[a_ptr]]! \n\t" // load A 8 cols
"vld1.s8 {d8-d11}, [%[b_ptr]]! \n\t" // load B first 4 rows
"vmovl.s8 q2, d0 \n\t" // process B first 4
// rows
"vmovl.s8 q3, d8 \n\t"
"vmlal.s16 q8, d6, d4[0]\n\t"
"vmlal.s16 q9, d7, d4[0]\n\t"
"vmlal.s16 q10, d6, d4[1]\n\t"
"vmlal.s16 q11, d7, d4[1]\n\t"
"vmlal.s16 q12, d6, d4[2]\n\t"
"vmlal.s16 q13, d7, d4[2]\n\t"
"vmlal.s16 q14, d6, d4[3]\n\t"
"vmlal.s16 q15, d7, d4[3]\n\t"
"vmovl.s8 q3, d9 \n\t"
"vmlal.s16 q8, d6, d5[0]\n\t"
"vmlal.s16 q9, d7, d5[0]\n\t"
"vmlal.s16 q10, d6, d5[1]\n\t"
"vmlal.s16 q11, d7, d5[1]\n\t"
"vmlal.s16 q12, d6, d5[2]\n\t"
"vmlal.s16 q13, d7, d5[2]\n\t"
"vmlal.s16 q14, d6, d5[3]\n\t"
"vmlal.s16 q15, d7, d5[3]\n\t"
"vld1.s8 {d12-d15}, [%[b_ptr]]! \n\t" // load B second 4
// rows
"vmovl.s8 q2, d1 \n\t"
"vmovl.s8 q3, d10 \n\t"
"vmlal.s16 q8, d6, d4[0]\n\t"
"vmlal.s16 q9, d7, d4[0]\n\t"
"vmlal.s16 q10, d6, d4[1]\n\t"
"vmlal.s16 q11, d7, d4[1]\n\t"
"vmlal.s16 q12, d6, d4[2]\n\t"
"vmlal.s16 q13, d7, d4[2]\n\t"
"vmlal.s16 q14, d6, d4[3]\n\t"
"vmlal.s16 q15, d7, d4[3]\n\t"
"vmovl.s8 q3, d11 \n\t"
"vmlal.s16 q8, d6, d5[0]\n\t"
"vmlal.s16 q9, d7, d5[0]\n\t"
"vmlal.s16 q10, d6, d5[1]\n\t"
"vmlal.s16 q11, d7, d5[1]\n\t"
"vmlal.s16 q12, d6, d5[2]\n\t"
"vmlal.s16 q13, d7, d5[2]\n\t"
"vmlal.s16 q14, d6, d5[3]\n\t"
"vmlal.s16 q15, d7, d5[3]\n\t"
"vmovl.s8 q2, d2 \n\t" // process B second 4
// rows
"vmovl.s8 q3, d12 \n\t"
"vmlal.s16 q8, d6, d4[0]\n\t"
"vmlal.s16 q9, d7, d4[0]\n\t"
"vmlal.s16 q10, d6, d4[1]\n\t"
"vmlal.s16 q11, d7, d4[1]\n\t"
"vmlal.s16 q12, d6, d4[2]\n\t"
"vmlal.s16 q13, d7, d4[2]\n\t"
"vmlal.s16 q14, d6, d4[3]\n\t"
"vmlal.s16 q15, d7, d4[3]\n\t"
"vmovl.s8 q3, d13 \n\t"
"vmlal.s16 q8, d6, d5[0]\n\t"
"vmlal.s16 q9, d7, d5[0]\n\t"
"vmlal.s16 q10, d6, d5[1]\n\t"
"vmlal.s16 q11, d7, d5[1]\n\t"
"vmlal.s16 q12, d6, d5[2]\n\t"
"vmlal.s16 q13, d7, d5[2]\n\t"
"vmlal.s16 q14, d6, d5[3]\n\t"
"vmlal.s16 q15, d7, d5[3]\n\t"
"vmovl.s8 q2, d3 \n\t"
"vmovl.s8 q3, d14 \n\t"
"vmlal.s16 q8, d6, d4[0]\n\t"
"vmlal.s16 q9, d7, d4[0]\n\t"
"vmlal.s16 q10, d6, d4[1]\n\t"
"vmlal.s16 q11, d7, d4[1]\n\t"
"vmlal.s16 q12, d6, d4[2]\n\t"
"vmlal.s16 q13, d7, d4[2]\n\t"
"vmlal.s16 q14, d6, d4[3]\n\t"
"vmlal.s16 q15, d7, d4[3]\n\t"
"vmovl.s8 q3, d15 \n\t"
"vmlal.s16 q8, d6, d5[0]\n\t"
"vmlal.s16 q9, d7, d5[0]\n\t"
"vmlal.s16 q10, d6, d5[1]\n\t"
"vmlal.s16 q11, d7, d5[1]\n\t"
"vmlal.s16 q12, d6, d5[2]\n\t"
"vmlal.s16 q13, d7, d5[2]\n\t"
"vmlal.s16 q14, d6, d5[3]\n\t"
"vmlal.s16 q15, d7, d5[3]\n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"bge 0b \n\t"
"1: \n\t" // last 4 rows
"subs %[kc3], %[kc3], #1 \n\t"
"blt 2f \n\t"
"vld1.s8 {d0-d1}, [%[a_ptr]]! \n\t" // load A 4 cols
"vld1.s8 {d8-d11}, [%[b_ptr]]! \n\t" // load B 4 rows
"vmovl.s8 q2, d0 \n\t"
"vmovl.s8 q3, d8 \n\t"
"vmlal.s16 q8, d6, d4[0]\n\t"
"vmlal.s16 q9, d7, d4[0]\n\t"
"vmlal.s16 q10, d6, d4[1]\n\t"
"vmlal.s16 q11, d7, d4[1]\n\t"
"vmlal.s16 q12, d6, d4[2]\n\t"
"vmlal.s16 q13, d7, d4[2]\n\t"
"vmlal.s16 q14, d6, d4[3]\n\t"
"vmlal.s16 q15, d7, d4[3]\n\t"
"vmovl.s8 q3, d9 \n\t"
"vmlal.s16 q8, d6, d5[0]\n\t"
"vmlal.s16 q9, d7, d5[0]\n\t"
"vmlal.s16 q10, d6, d5[1]\n\t"
"vmlal.s16 q11, d7, d5[1]\n\t"
"vmlal.s16 q12, d6, d5[2]\n\t"
"vmlal.s16 q13, d7, d5[2]\n\t"
"vmlal.s16 q14, d6, d5[3]\n\t"
"vmlal.s16 q15, d7, d5[3]\n\t"
"vmovl.s8 q2, d1 \n\t"
"vmovl.s8 q3, d10 \n\t"
"vmlal.s16 q8, d6, d4[0]\n\t"
"vmlal.s16 q9, d7, d4[0]\n\t"
"vmlal.s16 q10, d6, d4[1]\n\t"
"vmlal.s16 q11, d7, d4[1]\n\t"
"vmlal.s16 q12, d6, d4[2]\n\t"
"vmlal.s16 q13, d7, d4[2]\n\t"
"vmlal.s16 q14, d6, d4[3]\n\t"
"vmlal.s16 q15, d7, d4[3]\n\t"
"vmovl.s8 q3, d11 \n\t"
"vmlal.s16 q8, d6, d5[0]\n\t"
"vmlal.s16 q9, d7, d5[0]\n\t"
"vmlal.s16 q10, d6, d5[1]\n\t"
"vmlal.s16 q11, d7, d5[1]\n\t"
"vmlal.s16 q12, d6, d5[2]\n\t"
"vmlal.s16 q13, d7, d5[2]\n\t"
"vmlal.s16 q14, d6, d5[3]\n\t"
"vmlal.s16 q15, d7, d5[3]\n\t"
"2: \n\t" // last 2 rows
"subs %[kc5], %[kc5], #1 \n\t"
"blt 3f \n\t"
"vld1.s8 {d0}, [%[a_ptr]]! \n\t" // load A 2 cols
"vld1.s8 {d8-d9}, [%[b_ptr]]! \n\t" // load B 2 rows
"vmovl.s8 q2, d0 \n\t"
"vmovl.s8 q3, d8 \n\t"
"vmlal.s16 q8, d6, d4[0]\n\t"
"vmlal.s16 q9, d7, d4[0]\n\t"
"vmlal.s16 q10, d6, d4[1]\n\t"
"vmlal.s16 q11, d7, d4[1]\n\t"
"vmlal.s16 q12, d6, d4[2]\n\t"
"vmlal.s16 q13, d7, d4[2]\n\t"
"vmlal.s16 q14, d6, d4[3]\n\t"
"vmlal.s16 q15, d7, d4[3]\n\t"
"vmovl.s8 q3, d9 \n\t"
"vmlal.s16 q8, d6, d5[0]\n\t"
"vmlal.s16 q9, d7, d5[0]\n\t"
"vmlal.s16 q10, d6, d5[1]\n\t"
"vmlal.s16 q11, d7, d5[1]\n\t"
"vmlal.s16 q12, d6, d5[2]\n\t"
"vmlal.s16 q13, d7, d5[2]\n\t"
"vmlal.s16 q14, d6, d5[3]\n\t"
"vmlal.s16 q15, d7, d5[3]\n\t"
"3: \n\t" // last 1 row
"subs %[kc6], %[kc6], #1 \n\t"
"blt 4f \n\t"
"vld1.s8 {d0}, [%[a_ptr]] \n\t" // load A 1 col
"vld1.s8 {d8}, [%[b_ptr]] \n\t" // load B 1 row
"vmovl.s8 q2, d0 \n\t"
"vmovl.s8 q3, d8 \n\t"
"vmlal.s16 q8, d6, d4[0]\n\t"
"vmlal.s16 q9, d7, d4[0]\n\t"
"vmlal.s16 q10, d6, d4[1]\n\t"
"vmlal.s16 q11, d7, d4[1]\n\t"
"vmlal.s16 q12, d6, d4[2]\n\t"
"vmlal.s16 q13, d7, d4[2]\n\t"
"vmlal.s16 q14, d6, d4[3]\n\t"
"vmlal.s16 q15, d7, d4[3]\n\t"
"4: \n\t"
"vst1.32 {q8, q9}, [%[c]], %[step] \n\t"
"vst1.32 {q10, q11}, [%[c]], %[step] \n\t"
"vst1.32 {q12, q13}, [%[c]], %[step] \n\t"
"vst1.32 {q14, q15}, [%[c]] \n\t"
:
: [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
[kc3] "r"(kc3), [kc5] "r"(kc5), [kc6] "r"(kc6), [step] "r"(step)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
#endif // __ARM_NEON
}
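For readers who do not speak NEON, here is a scalar sketch of what AddDot4x8 computes; it is a reference under the packed layouts produced by PackMatrixA_4r (four interleaved A rows per k step) and PackMatrixB_8c (eight B columns per k step) further down, not the shipped kernel:

// Scalar reference for the 4x8 int8 micro-kernel: the C tile is
// overwritten, matching the asm, which zeroes q8-q15 before accumulating.
void AddDot4x8_reference(int32_t k, const int8_t *a, const int8_t *b,
                         int32_t *c, int32_t ldc) {
  for (int32_t i = 0; i < 4; ++i)
    for (int32_t j = 0; j < 8; ++j) c[i * ldc + j] = 0;
  for (int32_t p = 0; p < k; ++p)
    for (int32_t i = 0; i < 4; ++i)
      for (int32_t j = 0; j < 8; ++j)
        c[i * ldc + j] += static_cast<int32_t>(a[p * 4 + i]) *
                          static_cast<int32_t>(b[p * 8 + j]);
}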
// 8 bits int small block inner product
void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
const int8_t *a_ptr, *b_ptr;
a_ptr = a;
b_ptr = b;
@@ -46,383 +263,265 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
"pld [%[a_ptr]] \n\t"
"pld [%[b_ptr]] \n\t"
"pld [%[b_ptr], #64] \n\t"
"vmov.s32 q4, #0 \n\t"
"vmov.s32 q5, q4 \n\t"
"vmov.s32 q6, q4 \n\t"
"vmov.s32 q7, q4 \n\t"
"vmov.s32 q8, q4 \n\t"
"vmov.s32 q9, q4 \n\t"
"vmov.s32 q10, q4 \n\t"
"vmov.s32 q11, q4 \n\t"
"vmov.s32 q12, q4 \n\t"
"vmov.s32 q13, q4 \n\t"
"vmov.s32 q14, q4 \n\t"
"vmov.s32 q15, q4 \n\t"
"mov r0, #12 \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"blt 1f \n\t"
"0: \n\t"
"pld [%[a_ptr], #64] \n\t"
"pld [%[b_ptr], #128] \n\t"
"vld1.s8 {d0-d2}, [%[a_ptr]]! \n\t"  // A 4 cols
"vld1.s8 {d3}, [%[b_ptr]]! \n\t"  // B 1st row
"vmovl.s8 q2, d0 \n\t"
"vmovl.s8 q3, d3 \n\t"
"vmlal.s16 q4, d6, d4[0]\n\t"
"vmlal.s16 q5, d7, d4[0]\n\t"
"vmlal.s16 q6, d6, d4[1]\n\t"
"vmlal.s16 q7, d7, d4[1]\n\t"
"vmlal.s16 q8, d6, d4[2]\n\t"
"vmlal.s16 q9, d7, d4[2]\n\t"
"vmlal.s16 q10, d6, d4[3]\n\t"
"vmlal.s16 q11, d7, d4[3]\n\t"
"vmlal.s16 q12, d6, d5[0]\n\t"
"vmlal.s16 q13, d7, d5[0]\n\t"
"vmlal.s16 q14, d6, d5[1]\n\t"
"vmlal.s16 q15, d7, d5[1]\n\t"
"vld1.s8 {d3}, [%[b_ptr]]! \n\t"  // B 2nd row
"vmovl.s8 q3, d3 \n\t"
"vmlal.s16 q4, d6, d5[2]\n\t"
"vmlal.s16 q5, d7, d5[2]\n\t"
"vmlal.s16 q6, d6, d5[3]\n\t"
"vmlal.s16 q7, d7, d5[3]\n\t"
"vmovl.s8 q2, d1 \n\t"
"vmlal.s16 q8, d6, d4[0]\n\t"
"vmlal.s16 q9, d7, d4[0]\n\t"
"vmlal.s16 q10, d6, d4[1]\n\t"
"vmlal.s16 q11, d7, d4[1]\n\t"
"vmlal.s16 q12, d6, d4[2]\n\t"
"vmlal.s16 q13, d7, d4[2]\n\t"
"vmlal.s16 q14, d6, d4[3]\n\t"
"vmlal.s16 q15, d7, d4[3]\n\t"
"vld1.s8 {d3}, [%[b_ptr]]! \n\t"  // B 3rd row
"vmovl.s8 q3, d3 \n\t"
"vmlal.s16 q4, d6, d5[0]\n\t"
"vmlal.s16 q5, d7, d5[0]\n\t"
"vmlal.s16 q6, d6, d5[1]\n\t"
"vmlal.s16 q7, d7, d5[1]\n\t"
"vmlal.s16 q8, d6, d5[2]\n\t"
"vmlal.s16 q9, d7, d5[2]\n\t"
"vmlal.s16 q10, d6, d5[3]\n\t"
"vmlal.s16 q11, d7, d5[3]\n\t"
"vmovl.s8 q2, d2 \n\t"
"vmlal.s16 q12, d6, d4[0]\n\t"
"vmlal.s16 q13, d7, d4[0]\n\t"
"vmlal.s16 q14, d6, d4[1]\n\t"
"vmlal.s16 q15, d7, d4[1]\n\t"
"vld1.s8 {d3}, [%[b_ptr]]! \n\t"  // B 4th row
"vmovl.s8 q3, d3 \n\t"
"vmlal.s16 q4, d6, d4[2]\n\t"
"vmlal.s16 q5, d7, d4[2]\n\t"
"vmlal.s16 q6, d6, d4[3]\n\t"
"vmlal.s16 q7, d7, d4[3]\n\t"
"vmlal.s16 q8, d6, d5[0]\n\t"
"vmlal.s16 q9, d7, d5[0]\n\t"
"vmlal.s16 q10, d6, d5[1]\n\t"
"vmlal.s16 q11, d7, d5[1]\n\t"
"vmlal.s16 q12, d6, d5[2]\n\t"
"vmlal.s16 q13, d7, d5[2]\n\t"
"vmlal.s16 q14, d6, d5[3]\n\t"
"vmlal.s16 q15, d7, d5[3]\n\t"
"vld1.s8 {d0-d2}, [%[a_ptr]]! \n\t"  // A 4 cols
"vld1.s8 {d3}, [%[b_ptr]]! \n\t"  // B 1st row
"vmovl.s8 q2, d0 \n\t"
"vmovl.s8 q3, d3 \n\t"
"vmlal.s16 q4, d6, d4[0]\n\t"
"vmlal.s16 q5, d7, d4[0]\n\t"
"vmlal.s16 q6, d6, d4[1]\n\t"
"vmlal.s16 q7, d7, d4[1]\n\t"
"vmlal.s16 q8, d6, d4[2]\n\t"
"vmlal.s16 q9, d7, d4[2]\n\t"
"vmlal.s16 q10, d6, d4[3]\n\t"
"vmlal.s16 q11, d7, d4[3]\n\t"
"vmlal.s16 q12, d6, d5[0]\n\t"
"vmlal.s16 q13, d7, d5[0]\n\t"
"vmlal.s16 q14, d6, d5[1]\n\t"
"vmlal.s16 q15, d7, d5[1]\n\t"
"vld1.s8 {d3}, [%[b_ptr]]! \n\t"  // B 2nd row
"vmovl.s8 q3, d3 \n\t"
"vmlal.s16 q4, d6, d5[2]\n\t"
"vmlal.s16 q5, d7, d5[2]\n\t"
"vmlal.s16 q6, d6, d5[3]\n\t"
"vmlal.s16 q7, d7, d5[3]\n\t"
"vmovl.s8 q2, d1 \n\t"
"vmlal.s16 q8, d6, d4[0]\n\t"
"vmlal.s16 q9, d7, d4[0]\n\t"
"vmlal.s16 q10, d6, d4[1]\n\t"
"vmlal.s16 q11, d7, d4[1]\n\t"
"vmlal.s16 q12, d6, d4[2]\n\t"
"vmlal.s16 q13, d7, d4[2]\n\t"
"vmlal.s16 q14, d6, d4[3]\n\t"
"vmlal.s16 q15, d7, d4[3]\n\t"
"vld1.s8 {d3}, [%[b_ptr]]! \n\t"  // B 3rd row
"vmovl.s8 q3, d3 \n\t"
"vmlal.s16 q4, d6, d5[0]\n\t"
"vmlal.s16 q5, d7, d5[0]\n\t"
"vmlal.s16 q6, d6, d5[1]\n\t"
"vmlal.s16 q7, d7, d5[1]\n\t"
"vmlal.s16 q8, d6, d5[2]\n\t"
"vmlal.s16 q9, d7, d5[2]\n\t"
"vmlal.s16 q10, d6, d5[3]\n\t"
"vmlal.s16 q11, d7, d5[3]\n\t"
"vmovl.s8 q2, d2 \n\t"
"vmlal.s16 q12, d6, d4[0]\n\t"
"vmlal.s16 q13, d7, d4[0]\n\t"
"vmlal.s16 q14, d6, d4[1]\n\t"
"vmlal.s16 q15, d7, d4[1]\n\t"
"vld1.s8 {d3}, [%[b_ptr]]! \n\t"  // B 4th row
"vmovl.s8 q3, d3 \n\t"
"vmlal.s16 q4, d6, d4[2]\n\t"
"vmlal.s16 q5, d7, d4[2]\n\t"
"vmlal.s16 q6, d6, d4[3]\n\t"
"vmlal.s16 q7, d7, d4[3]\n\t"
"vmlal.s16 q8, d6, d5[0]\n\t"
"vmlal.s16 q9, d7, d5[0]\n\t"
"vmlal.s16 q10, d6, d5[1]\n\t"
"vmlal.s16 q11, d7, d5[1]\n\t"
"vmlal.s16 q12, d6, d5[2]\n\t"
"vmlal.s16 q13, d7, d5[2]\n\t"
"vmlal.s16 q14, d6, d5[3]\n\t"
"vmlal.s16 q15, d7, d5[3]\n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"bge 0b \n\t"
"1: \n\t"  // last <8 rows
"subs %[kc3], %[kc3], #1 \n\t"
"blt 2f \n\t"
"vld1.s8 {d0-d2}, [%[a_ptr]]! \n\t"  // A 4 cols
"vld1.s8 {d3}, [%[b_ptr]]! \n\t"  // B 1st row
"vmovl.s8 q2, d0 \n\t"
"vmovl.s8 q3, d3 \n\t"
"vmlal.s16 q4, d6, d4[0]\n\t"
"vmlal.s16 q5, d7, d4[0]\n\t"
"vmlal.s16 q6, d6, d4[1]\n\t"
"vmlal.s16 q7, d7, d4[1]\n\t"
"vmlal.s16 q8, d6, d4[2]\n\t"
"vmlal.s16 q9, d7, d4[2]\n\t"
"vmlal.s16 q10, d6, d4[3]\n\t"
"vmlal.s16 q11, d7, d4[3]\n\t"
"vmlal.s16 q12, d6, d5[0]\n\t"
"vmlal.s16 q13, d7, d5[0]\n\t"
"vmlal.s16 q14, d6, d5[1]\n\t"
"vmlal.s16 q15, d7, d5[1]\n\t"
"vld1.s8 {d3}, [%[b_ptr]]! \n\t"  // B 2nd row
"vmovl.s8 q3, d3 \n\t"
"vmlal.s16 q4, d6, d5[2]\n\t"
"vmlal.s16 q5, d7, d5[2]\n\t"
"vmlal.s16 q6, d6, d5[3]\n\t"
"vmlal.s16 q7, d7, d5[3]\n\t"
"vmovl.s8 q2, d1 \n\t"
"vmlal.s16 q8, d6, d4[0]\n\t"
"vmlal.s16 q9, d7, d4[0]\n\t"
"vmlal.s16 q10, d6, d4[1]\n\t"
"vmlal.s16 q11, d7, d4[1]\n\t"
"vmlal.s16 q12, d6, d4[2]\n\t"
"vmlal.s16 q13, d7, d4[2]\n\t"
"vmlal.s16 q14, d6, d4[3]\n\t"
"vmlal.s16 q15, d7, d4[3]\n\t"
"vld1.s8 {d3}, [%[b_ptr]]! \n\t"  // B 3rd row
"vmovl.s8 q3, d3 \n\t"
"vmlal.s16 q4, d6, d5[0]\n\t"
"vmlal.s16 q5, d7, d5[0]\n\t"
"vmlal.s16 q6, d6, d5[1]\n\t"
"vmlal.s16 q7, d7, d5[1]\n\t"
"vmlal.s16 q8, d6, d5[2]\n\t"
"vmlal.s16 q9, d7, d5[2]\n\t"
"vmlal.s16 q10, d6, d5[3]\n\t"
"vmlal.s16 q11, d7, d5[3]\n\t"
"vmovl.s8 q2, d2 \n\t"
"vmlal.s16 q12, d6, d4[0]\n\t"
"vmlal.s16 q13, d7, d4[0]\n\t"
"vmlal.s16 q14, d6, d4[1]\n\t"
"vmlal.s16 q15, d7, d4[1]\n\t"
"vld1.s8 {d3}, [%[b_ptr]]! \n\t"  // B 4th row
"vmovl.s8 q3, d3 \n\t"
"vmlal.s16 q4, d6, d4[2]\n\t"
"vmlal.s16 q5, d7, d4[2]\n\t"
"vmlal.s16 q6, d6, d4[3]\n\t"
"vmlal.s16 q7, d7, d4[3]\n\t"
"vmlal.s16 q8, d6, d5[0]\n\t"
"vmlal.s16 q9, d7, d5[0]\n\t"
"vmlal.s16 q10, d6, d5[1]\n\t"
"vmlal.s16 q11, d7, d5[1]\n\t"
"vmlal.s16 q12, d6, d5[2]\n\t"
"vmlal.s16 q13, d7, d5[2]\n\t"
"vmlal.s16 q14, d6, d5[3]\n\t"
"vmlal.s16 q15, d7, d5[3]\n\t"
"2: \n\t"  // last <4 rows
"subs %[kc5], %[kc5], #1 \n\t"
"blt 3f \n\t"
"vld1.s8 {d0, d1}, [%[a_ptr]], r0 \n\t"
"vld1.s8 {d3}, [%[b_ptr]]! \n\t"  // B 1st row
"vmovl.s8 q2, d0 \n\t"
"vmovl.s8 q3, d3 \n\t"
"vmlal.s16 q4, d6, d4[0]\n\t"
"vmlal.s16 q5, d7, d4[0]\n\t"
"vmlal.s16 q6, d6, d4[1]\n\t"
"vmlal.s16 q7, d7, d4[1]\n\t"
"vmlal.s16 q8, d6, d4[2]\n\t"
"vmlal.s16 q9, d7, d4[2]\n\t"
"vmlal.s16 q10, d6, d4[3]\n\t"
"vmlal.s16 q11, d7, d4[3]\n\t"
"vmlal.s16 q12, d6, d5[0]\n\t"
"vmlal.s16 q13, d7, d5[0]\n\t"
"vmlal.s16 q14, d6, d5[1]\n\t"
"vmlal.s16 q15, d7, d5[1]\n\t"
"vld1.s8 {d3}, [%[b_ptr]]! \n\t"  // B 2nd row
"vmovl.s8 q3, d3 \n\t"
"vmlal.s16 q4, d6, d5[2]\n\t"
"vmlal.s16 q5, d7, d5[2]\n\t"
"vmlal.s16 q6, d6, d5[3]\n\t"
"vmlal.s16 q7, d7, d5[3]\n\t"
"vmovl.s8 q2, d1 \n\t"
"vmlal.s16 q8, d6, d4[0]\n\t"
"vmlal.s16 q9, d7, d4[0]\n\t"
"vmlal.s16 q10, d6, d4[1]\n\t"
"vmlal.s16 q11, d7, d4[1]\n\t"
"vmlal.s16 q12, d6, d4[2]\n\t"
"vmlal.s16 q13, d7, d4[2]\n\t"
"vmlal.s16 q14, d6, d4[3]\n\t"
"vmlal.s16 q15, d7, d4[3]\n\t"
"3: \n\t"  // last <2 rows
"subs %[kc6], %[kc6], #1 \n\t"
"blt 4f \n\t"
"vld1.s8 {d0}, [%[a_ptr]] \n\t"
"vld1.s8 {d3}, [%[b_ptr]] \n\t"
"vmovl.s8 q2, d0 \n\t"
"vmovl.s8 q3, d3 \n\t"
"vmlal.s16 q4, d6, d4[0]\n\t"
"vmlal.s16 q5, d7, d4[0]\n\t"
"vmlal.s16 q6, d6, d4[1]\n\t"
"vmlal.s16 q7, d7, d4[1]\n\t"
"vmlal.s16 q8, d6, d4[2]\n\t"
"vmlal.s16 q9, d7, d4[2]\n\t"
"vmlal.s16 q10, d6, d4[3]\n\t"
"vmlal.s16 q11, d7, d4[3]\n\t"
"vmlal.s16 q12, d6, d5[0]\n\t"
"vmlal.s16 q13, d7, d5[0]\n\t"
"vmlal.s16 q14, d6, d5[1]\n\t"
"vmlal.s16 q15, d7, d5[1]\n\t"
"4: \n\t"
"vst1.32 {q4, q5}, [%[c]], %[step] \n\t"
"vst1.32 {q6, q7}, [%[c]], %[step] \n\t"
@@ -435,7 +534,8 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
[kc3] "r"(kc3), [kc5] "r"(kc5), [kc6] "r"(kc6), [step] "r"(step)
: "cc", "memory", "r0", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif  // __aarch64__
#endif  // __ARM_NEON
}
// 8 bits int inner product
@@ -445,8 +545,9 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
int8_t *bias) {
#pragma omp parallel for
for (int32_t j = 0; j < nc; j += NR) {
for (int32_t i = 0; i < mc; i += MR_INT8) {
// AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
}
}
if (alpha != 1) {
@@ -474,12 +575,53 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
return;
}
}
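The loop above walks the mc x nc panel of C in MR_INT8 x NR tiles; a sketch of the same tiling, with the scalar reference kernel from the earlier sketch standing in for the NEON one:

// Reference tiling loop (sketch): tile (i, j) is computed from the packed
// A panel at a + i * kc and the packed B panel at b + j * kc.
void InnerKernel_reference(int32_t mc, int32_t nc, const int8_t *a,
                           const int8_t *b, int32_t *c, int32_t ldc,
                           int32_t kc) {
  for (int32_t j = 0; j < nc; j += 8)    // NR = 8 columns per tile
    for (int32_t i = 0; i < mc; i += 4)  // MR_INT8 = 4 rows per tile
      AddDot4x8_reference(kc, a + i * kc, b + j * kc, c + i * ldc + j, ldc);
}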
// 8 bits int PackMatrixA_4r
void Gemm::PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer) {
const int8_t *a0, *a1, *a2, *a3;
for (int32_t i = 0; i < m - m_tail; i += MR_INT8) {
a0 = A + i * lda;
a1 = A + (i + 1) * lda;
a2 = A + (i + 2) * lda;
a3 = A + (i + 3) * lda;
for (int32_t j = 0; j < k; ++j) {
*buffer++ = *a0++;
*buffer++ = *a1++;
*buffer++ = *a2++;
*buffer++ = *a3++;
}
}
if (m_tail != 0) {
a0 = &A(m - m_tail, 0);
a1 = a0 + lda;
a2 = a0 + 2 * lda;
a3 = a0 + 3 * lda;
switch (m_tail) {
case 1:
a1 = zero_int8;
case 2:
a2 = zero_int8;
case 3:
a3 = zero_int8;
break;
default:
break;
}
for (int j = 0; j < k; ++j) {
*buffer++ = *a0++;
*buffer++ = *a1++;
*buffer++ = *a2++;
*buffer++ = *a3++;
}
}
}
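A layout sketch for PackMatrixA_4r on a hypothetical 5x3 matrix A (lda = 3): the full four-row panel is emitted column-interleaved, and the one-row tail is padded by aliasing the missing row pointers to zero_int8 via the deliberate switch fall-through above, so AddDot4x8 always reads a uniform panel:

//   A = | a00 a01 a02 |   buffer = a00 a10 a20 a30   (panel 0, k step 0)
//       | a10 a11 a12 |            a01 a11 a21 a31   (panel 0, k step 1)
//       | a20 a21 a22 |            a02 a12 a22 a32   (panel 0, k step 2)
//       | a30 a31 a32 |            a40   0   0   0   (tail panel, k step 0)
//       | a40 a41 a42 |            a41   0   0   0   (tail panel, k step 1)
//                                  a42   0   0   0   (tail panel, k step 2)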
// 8 bits int PackMatrixA_6r
void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer) {
const int32_t i_length = m - m_tail;
for (int32_t i = 0; i < i_length; i += MR_INT8) {
const int8_t *a0 = A + i * lda;
const int8_t *a1 = A + (i + 1) * lda;
const int8_t *a2 = A + (i + 2) * lda;
@@ -539,6 +681,9 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
for (int32_t i = 0; i < k; ++i) {
const int8_t *b0 = &B(i, j);
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
// "pld [%[b0]] \n\t"
"vld1.s8 {d0}, [%[b0]] \n\t"
@@ -546,6 +691,7 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
: [local_buffer] "+r"(local_buffer)
: [b0] "r"(b0)
: "memory", "q0");
#endif // __aarch64__
#else
*local_buffer++ = *b0++;
*local_buffer++ = *b0++;
@@ -585,13 +731,13 @@ void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
MC = L1 / (KC * sizeof(int8_t));
NC = L2 / (KC * sizeof(int8_t));
// make sure MC is multiple of MR_INT8, and NC is multiple of NR
if (MC == 0) {
MC = MR_INT8;
} else {
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
}
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
if (NC == 0) {
@@ -618,7 +764,8 @@ void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB_int8);
for (int32_t i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
// PackMatrixA_6r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
if (bias == nullptr) {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int8, &C(i, j), ldc, relu, nullptr);
@@ -643,6 +790,9 @@ void Gemm::WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
int32_t nc1 = nc >> 4;
int32_t _nc1 = nc & 15;
int32_t step = sizeof(int32_t) * ldc;
@@ -696,6 +846,7 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
}
}
}
#endif // __aarch64__
#endif  // __ARM_NEON
}
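A worked example of the MC blocking in the Sgemm hunk above, under assumed cache parameters (say L1 = 32768 bytes and KC = 256, giving an initial MC of 128): for m = 100, mblock_num = (100 + 127) / 128 = 1 and MC becomes 100, already a multiple of MR_INT8 = 4; for m = 130, mblock_num = 2, MC = (130 + 1) / 2 = 65, and the round-up (65 + 3) / 4 * 4 yields 68, so each row block stays close to m / mblock_num while remaining kernel-aligned.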
...
@@ -999,6 +999,28 @@ class MultiClassNMSParam : public OpParam {
};
#endif
#ifdef POLYGONBOXTRANSFORM_OP
template <typename Dtype>
class PolygonBoxTransformParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
PolygonBoxTransformParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
input_ = InputFrom<GType>(inputs, scope);
output_ = OutputFrom<GType>(outputs, scope);
}
const RType *Input() const { return input_; }
RType *Output() const { return output_; }
private:
RType *input_;
RType *output_;
};
#endif
template <typename Dtype>
class FeedParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
@@ -2272,6 +2294,7 @@ class ShapeParam : public OpParam {
};
#endif
#ifdef QUANT_OP
template <typename Dtype>
class QuantizeParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
@@ -2311,7 +2334,9 @@ class QuantizeParam : public OpParam {
// only nearest_zero and nearest_even are currently valid
RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
};
#endif
#ifdef DEQUANT_OP
template <typename Dtype>
class DequantizeParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
@@ -2339,6 +2364,7 @@ class DequantizeParam : public OpParam {
RType *activation_scale_;
float weight_scale_;
};
#endif
}  // namespace operators
}  // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POLYGONBOXTRANSFORM_OP
#include "operators/polygon_box_transform_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void PolygonBoxTransformOp<Dtype, T>::InferShape() const {
PADDLE_MOBILE_ENFORCE(this->param_.Input() != nullptr,
"Input (Input) of polygon_box_transform op should not be null.");
PADDLE_MOBILE_ENFORCE(this->param_.Output() != nullptr,
"Output (Output) of polygon_box_transform op should not be null.");
auto input_dims = this->param_.Input()->dims();
PADDLE_MOBILE_ENFORCE(input_dims.size() == 4, "input's rank must be 4.");
PADDLE_MOBILE_ENFORCE(input_dims[1] % 2 == 0,
"input's second dimension must be even.");
this->param_.Output()->Resize(input_dims);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(polygon_box_transform, ops::PolygonBoxTransformOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POLYGONBOXTRANSFORM_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/polygon_box_transform_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using paddle_mobile::framework::Tensor;
template <typename DeviceType, typename T>
class PolygonBoxTransformOp
: public framework::OperatorWithKernel<
DeviceType, PolygonBoxTransformParam<DeviceType>,
operators::PolygonBoxTransformKernel<DeviceType, T>> {
public:
PolygonBoxTransformOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, PolygonBoxTransformParam<DeviceType>,
operators::PolygonBoxTransformKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, PolygonBoxTransformParam<DeviceType>,
operators::PolygonBoxTransformKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
@@ -181,6 +181,10 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h)
target_link_libraries(test-multiclassnms-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-polygon-box-transform-op operators/test_polygon_box_transform_op.cpp test_helper.h test_include.h)
target_link_libraries(test-polygon-box-transform-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h)
target_link_libraries(test-reshape-op paddle-mobile)
...
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint-gcc.h>
#include "../test_helper.h" #include "../test_helper.h"
#include "../test_include.h" #include "../test_include.h"
#include "operators/mul_op.h" #include "operators/mul_op.h"
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "../test_include.h"
#include "operators/polygon_box_transform_op.h"
namespace paddle_mobile {
namespace framework {
template <typename Dtype>
class TestPolygonBoxTransformOp {
public:
explicit TestPolygonBoxTransformOp(const Program<Dtype> p) : program_(p) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
for (auto block_desc : blocks) {
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
for (auto op : ops) {
if (op->Type() == "polygon_box_transform") {
DLOG << " attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " input is : " << op->Input("Input")[0];
input_var_name = op->Input("Input")[0];
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " output is : " << op->Output("Output")[0];
output_var_name = op->Output("Output")[0];
std::shared_ptr<operators::PolygonBoxTransformOp<Dtype, float>>
op_ptr = std::make_shared<
operators::PolygonBoxTransformOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(op_ptr);
return;
}
}
}
}
std::shared_ptr<Tensor> predict(const Tensor &t) {
auto scope = program_.scope;
Variable *input_feed_value = scope->Var(input_var_name);
auto tensor_input = input_feed_value->GetMutable<LoDTensor>();
tensor_input->ShareDataWith(t);
Variable *output = scope->Var(output_var_name);
auto *output_tensor = output->GetMutable<LoDTensor>();
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
predict(t, 0);
return out_tensor;
}
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
string input_var_name;
string output_var_name;
void predict(const Tensor &t, int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
op->Run();
}
}
};
template class TestPolygonBoxTransformOp<CPU>;
} // namespace framework
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run PolygonBoxTransform Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_ocr));
paddle_mobile::framework::Tensor input;
SetupTensor<float>(&input, {1, 8, 1, 2}, static_cast<float>(0),
static_cast<float>(1));
auto *input_ptr = input.data<float>();
for (int i = 0; i < 16; ++i) {
*(input_ptr + i) = i;
}
DLOG << "input : ";
for (int i = 0; i < input.numel(); ++i) {
DLOG << " index " << i << " : " << input_ptr[i];
}
paddle_mobile::framework::TestPolygonBoxTransformOp<paddle_mobile::CPU>
testPolygonBoxTransformOp(program);
auto output = testPolygonBoxTransformOp.predict(input);
auto *output_ptr = output->data<float>();
DLOG << "output : ";
for (int i = 0; i < output->numel(); ++i) {
DLOG << " index " << i << " : " << output_ptr[i];
}
return 0;
}
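For the {1, 8, 1, 2} input filled with 0..15, the kernel's formula reduces (with h = 1, w = 2) to out = 4 * id_w - in on even channels and out = -in on odd ones, so the run should log, computed by hand:

// input : 0  1  2  3  4  5  6  7  8  9  10  11  12  13  14  15
// output: 0  3 -2 -3 -4 -1 -6 -7 -8 -5 -10 -11 -12 -9 -14 -15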
@@ -195,6 +195,7 @@ if(NOT FOUND_MATCH)
set(LRN_OP ON)
set(MUL_OP ON)
set(MULTICLASSNMS_OP ON)
set(POLYGONBOXTRANSFORM_OP ON)
set(POOL_OP ON)
set(PRIORBOX_OP ON)
set(RELU_OP ON)
@@ -238,6 +239,7 @@ endif()
# option(LRN_OP "" ON)
# option(MUL_OP "" ON)
# option(MULTICLASSNMS_OP "" ON)
# option(POLYGONBOXTRANSFORM_OP "" ON)
# option(POOL_OP "" ON)
# option(PRIORBOX_OP "" ON)
# option(RELU_OP "" ON)
@@ -292,6 +294,9 @@ endif()
if (MULTICLASSNMS_OP)
add_definitions(-DMULTICLASSNMS_OP)
endif()
if (POLYGONBOXTRANSFORM_OP)
add_definitions(-DPOLYGONBOXTRANSFORM_OP)
endif()
if (POOL_OP)
add_definitions(-DPOOL_OP)
endif()
...