提交 48369c6d 编写于 作者: Zhen Wang

before optimizing

上级 ff1cfec3
......@@ -60,19 +60,13 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
"blt 1f \n\t"
"0: \n\t"
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0
// used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B
// row1, q1
// used
"vmov.s8 q2, #0 \n\t" // q2 used
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 used
"vdup.s8 d6, d0[0] \n\t"
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B row1, q1 used
"vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B
// row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B
// row1, q3
// free
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B row1, q3 free
"vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t"
......@@ -112,19 +106,13 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
"vaddw.s16 q15, q15, d5 \n\t" // res row 5
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0
// used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B
// row1, q1
// used
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B row1, q1 used
"vmov.s8 q2, #0 \n\t" // q2 used
"vdup.s8 d6, d0[0] \n\t"
"vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B
// row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B
// row1, q3
// free
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B row1, q3 free
"vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t"
......@@ -164,19 +152,13 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
"vaddw.s16 q15, q15, d5 \n\t" // res row 5
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0
// used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B
// row1, q1
// used
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B row1, q1 used
"vmov.s8 q2, #0 \n\t" // q2 used
"vdup.s8 d6, d0[0] \n\t"
"vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B
// row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B
// row1, q3
// free
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B row1, q3 free
"vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t"
......@@ -216,19 +198,13 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
"vaddw.s16 q15, q15, d5 \n\t" // res row 5
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0
// used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B
// row1, q1
// used
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B row1, q1 used
"vmov.s8 q2, #0 \n\t" // q2 used
"vdup.s8 d6, d0[0] \n\t"
"vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B
// row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B
// row1, q3
// free
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B row1, q3 free
"vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t"
......@@ -274,19 +250,13 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
"blt 3f \n\t"
"2: \n\t"
"vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0
// used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B
// row1, q1
// used
"vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 used
"vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B row1, q1 used
"vmov.s8 q2, #0 \n\t" // q2 used
"vdup.s8 d6, d0[0] \n\t"
"vdup.s8 d7, d1[0] \n\t" // q3 used
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B
// row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B
// row1, q3
// free
"vmlal.s8 q2, d2, d6 \n\t" // A col00 * B row0
"vmlal.s8 q2, d3, d7 \n\t" // A col10 * B row1, q3 free
"vaddw.s16 q4, q4, d4 \n\t"
"vaddw.s16 q5, q5, d5 \n\t" // res row 0
"vmov.s8 q2, #0 \n\t"
......@@ -323,13 +293,12 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
"vmlal.s8 q2, d2, d6 \n\t"
"vmlal.s8 q2, d3, d7 \n\t"
"vaddw.s16 q14, q14, d4 \n\t"
"vaddw.s16 q15, q15, d5 \n\t" // res row 5
"vaddw.s16 q15, q15, d5 \n\t" // res row5
"subs %[kc3], %[kc3], #1 \n\t"
"bge 2b \n\t"
"3: \n\t" // odd, last
// row
"3: \n\t" // odd, last row
"subs %[kc4], %[kc4], #1 \n\t"
"blt 4f \n\t"
"vld1.s8 {d0}, [%[a_ptr]] \n\t"
......
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint-gcc.h>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/mul_op.h"
......@@ -73,12 +74,19 @@ int TestMulOP() {
}
}
int32_t eq = 0;
int32_t neq = 0;
for (int32_t i = 0; i < m * n; ++i) {
PADDLE_MOBILE_ENFORCE(
output_data[i] == c[i], "output[%d] = %d, output_cmp[%d] = %d", i,
static_cast<int32_t>(output_data[i]), i, static_cast<int32_t>(c[i]));
if (static_cast<int>(output_data[i] == c[i])) {
++eq;
} else {
++neq;
}
DLOG << "Run MulOp successfully!";
}
DLOG << "mnk=" << m << " " << n << " " << k << " eq=" << eq << " neq=" << neq;
delete op;
return 0;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册