提交 c32daf1d 编写于 作者: Z Zhen Wang

Gemm speed up: int8_t/float = 1.27(Snapdragon 835).

上级 cb8ea145
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <stdint-gcc.h>
#include <string> #include <string>
#include "common/log.h" #include "common/log.h"
......
...@@ -235,8 +235,8 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, ...@@ -235,8 +235,8 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
: :
: [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
[kc3] "r"(kc3), [kc5] "r"(kc5), [kc6] "r"(kc6), [step] "r"(step) [kc3] "r"(kc3), [kc5] "r"(kc5), [kc6] "r"(kc6), [step] "r"(step)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__ #endif // __aarch64__
#endif // __ARM_NEON #endif // __ARM_NEON
} }
...@@ -546,7 +546,7 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha, ...@@ -546,7 +546,7 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
#pragma omp parallel for #pragma omp parallel for
for (int32_t j = 0; j < nc; j += NR) { for (int32_t j = 0; j < nc; j += NR) {
for (int32_t i = 0; i < mc; i += MR_INT8) { for (int32_t i = 0; i < mc; i += MR_INT8) {
// AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); // AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
} }
} }
...@@ -764,7 +764,7 @@ void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A, ...@@ -764,7 +764,7 @@ void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB_int8); PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB_int8);
for (int32_t i = 0; i < m; i += MC) { for (int32_t i = 0; i < m; i += MC) {
mc = s_min(m - i, MC); mc = s_min(m - i, MC);
// PackMatrixA_6r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8); // PackMatrixA_6r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8); PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
if (bias == nullptr) { if (bias == nullptr) {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta, InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
......
...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <stdint-gcc.h>
#include "../test_helper.h" #include "../test_helper.h"
#include "../test_include.h" #include "../test_include.h"
#include "operators/mul_op.h" #include "operators/mul_op.h"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册