提交 48369c6d 作者: Zhen Wang

before optimizing

上级 ff1cfec3
...@@ -60,19 +60,13 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, ...@@ -60,19 +60,13 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
"blt 1f \n\t" "blt 1f \n\t"
"0: \n\t" "0: \n\t"
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0 "vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0
// used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B
// row1, q1
// used
"vmov.s8 q2, #0 \n\t" // q2 used "vmov.s8 q2, #0 \n\t" // q2 used
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 used
"vdup.s8 d6, d0[0] \n\t" "vdup.s8 d6, d0[0] \n\t"
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B row1, q1 used
"vdup.s8 d7, d1[0] \n\t" // q3 used "vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B "vmlal.s8 q2, d2, d6 \n\t" // A col00 * B row0
// row0 "vmlal.s8 q2, d3, d7 \n\t" // A col10 * B row1, q3 free
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B
// row1, q3
// free
"vaddw.s16 q4, q4, d4 \n\t" "vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0 "vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t" "vmov.s8 q2, #0 \n\t"
...@@ -112,19 +106,13 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, ...@@ -112,19 +106,13 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
"vaddw.s16 q15, q15, d5 \n\t" // res row 5 "vaddw.s16 q15, q15, d5 \n\t" // res row 5
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0 "vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 "vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 used
// used "vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B row1, q1 used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B
// row1, q1
// used
"vmov.s8 q2, #0 \n\t" // q2 used "vmov.s8 q2, #0 \n\t" // q2 used
"vdup.s8 d6, d0[0] \n\t" "vdup.s8 d6, d0[0] \n\t"
"vdup.s8 d7, d1[0] \n\t" // q3 used "vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B "vmlal.s8 q2, d2, d6 \n\t" // A col00 * B row0
// row0 "vmlal.s8 q2, d3, d7 \n\t" // A col10 * B row1, q3 free
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B
// row1, q3
// free
"vaddw.s16 q4, q4, d4 \n\t" "vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0 "vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t" "vmov.s8 q2, #0 \n\t"
...@@ -164,19 +152,13 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, ...@@ -164,19 +152,13 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
"vaddw.s16 q15, q15, d5 \n\t" // res row 5 "vaddw.s16 q15, q15, d5 \n\t" // res row 5
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0 "vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 "vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 used
// used "vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B row1, q1 used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B
// row1, q1
// used
"vmov.s8 q2, #0 \n\t" // q2 used "vmov.s8 q2, #0 \n\t" // q2 used
"vdup.s8 d6, d0[0] \n\t" "vdup.s8 d6, d0[0] \n\t"
"vdup.s8 d7, d1[0] \n\t" // q3 used "vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B "vmlal.s8 q2, d2, d6 \n\t" // A col00 * B row0
// row0 "vmlal.s8 q2, d3, d7 \n\t" // A col10 * B row1, q3 free
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B
// row1, q3
// free
"vaddw.s16 q4, q4, d4 \n\t" "vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0 "vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t" "vmov.s8 q2, #0 \n\t"
...@@ -216,19 +198,13 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, ...@@ -216,19 +198,13 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
"vaddw.s16 q15, q15, d5 \n\t" // res row 5 "vaddw.s16 q15, q15, d5 \n\t" // res row 5
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0 "vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 "vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 used
// used "vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B row1, q1 used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B
// row1, q1
// used
"vmov.s8 q2, #0 \n\t" // q2 used "vmov.s8 q2, #0 \n\t" // q2 used
"vdup.s8 d6, d0[0] \n\t" "vdup.s8 d6, d0[0] \n\t"
"vdup.s8 d7, d1[0] \n\t" // q3 used "vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B "vmlal.s8 q2, d2, d6 \n\t" // A col00 * B row0
// row0 "vmlal.s8 q2, d3, d7 \n\t" // A col10 * B row1, q3 free
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B
// row1, q3
// free
"vaddw.s16 q4, q4, d4 \n\t" "vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0 "vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t" "vmov.s8 q2, #0 \n\t"
...@@ -274,19 +250,13 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, ...@@ -274,19 +250,13 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
"blt 3f \n\t" "blt 3f \n\t"
"2: \n\t" "2: \n\t"
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0 "vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 "vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 used
// used "vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B row1, q1 used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B
// row1, q1
// used
"vmov.s8 q2, #0 \n\t" // q2 used "vmov.s8 q2, #0 \n\t" // q2 used
"vdup.s8 d6, d0[0] \n\t" "vdup.s8 d6, d0[0] \n\t"
"vdup.s8 d7, d1[0] \n\t" // q3 used "vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B "vmlal.s8 q2, d2, d6 \n\t" // A col00 * B row0
// row0 "vmlal.s8 q2, d3, d7 \n\t" // A col10 * B row1, q3 free
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B
// row1, q3
// free
"vaddw.s16 q4, q4, d4 \n\t" "vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0 "vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t" "vmov.s8 q2, #0 \n\t"
...@@ -323,13 +293,12 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, ...@@ -323,13 +293,12 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
"vmlal.s8 q2, d2, d6 \n\t" "vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t" "vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q14, q14, d4 \n\t" "vaddw.s16 q14, q14, d4 \n\t"
"vaddw.s16 q15, q15, d5 \n\t" // res row 5 "vaddw.s16 q15, q15, d5 \n\t" // res row5
"subs %[kc3], %[kc3], #1 \n\t" "subs %[kc3], %[kc3], #1 \n\t"
"bge 2b \n\t" "bge 2b \n\t"
"3: \n\t" // odd, last "3: \n\t" // odd, last row
// row
"subs %[kc4], %[kc4], #1 \n\t" "subs %[kc4], %[kc4], #1 \n\t"
"blt 4f \n\t" "blt 4f \n\t"
"vld1.s8 {d0}, [%[a_ptr]] \n\t" "vld1.s8 {d0}, [%[a_ptr]] \n\t"
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <stdint-gcc.h>
#include "../test_helper.h" #include "../test_helper.h"
#include "../test_include.h" #include "../test_include.h"
#include "operators/mul_op.h" #include "operators/mul_op.h"
...@@ -73,12 +74,19 @@ int TestMulOP() { ...@@ -73,12 +74,19 @@ int TestMulOP() {
} }
} }
int32_t eq = 0;
int32_t neq = 0;
for (int32_t i = 0; i < m * n; ++i) { for (int32_t i = 0; i < m * n; ++i) {
PADDLE_MOBILE_ENFORCE( PADDLE_MOBILE_ENFORCE(
output_data[i] == c[i], "output[%d] = %d, output_cmp[%d] = %d", i, output_data[i] == c[i], "output[%d] = %d, output_cmp[%d] = %d", i,
static_cast<int32_t>(output_data[i]), i, static_cast<int32_t>(c[i])); static_cast<int32_t>(output_data[i]), i, static_cast<int32_t>(c[i]));
if (static_cast<int>(output_data[i] == c[i])) {
++eq;
} else {
++neq;
} }
DLOG << "Run MulOp successfully!"; }
DLOG << "mnk=" << m << " " << n << " " << k << " eq=" << eq << " neq=" << neq;
delete op; delete op;
return 0; return 0;
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请注册