add vector matrix multiplication in Gemm

58183ed5 · zhaojiaying01 · 85d7f010 · 58183ed5 · 58183ed5 · 58183ed5
隐藏空白更改
内联并排

Showing 3 changed files with 5 additions and 8 deletions

src/memory/t_malloc.cpp src/memory/t_malloc.cpp +3 -3

src/operators/math/gemm.cpp src/operators/math/gemm.cpp +1 -4

src/operators/math/gemm.h src/operators/math/gemm.h +1 -1

未找到文件。
--- a/src/memory/t_malloc.cpp
+++ b/src/memory/t_malloc.cpp
@@ -14,17 +14,17 @@ limitations under the License. */
 #pragma once
-#include "t_malloc.h"
+#include "memory/t_malloc.h"
 #include <cstdlib>
 #include <cstring>
 namespace paddle_mobile {
 namespace memory {
-const int MALLOC_ALIGN = 64;
+const int MALLOC_ALIGN = 16;
 void Copy(void *dst, const void *src, size_t num) {
  std::memcpy(dst, src, num);
-};
+}
 void *Alloc(size_t size) {
  size_t offset = sizeof(void *) + MALLOC_ALIGN - 1;

--- a/src/operators/math/gemm.cpp
+++ b/src/operators/math/gemm.cpp
@@ -216,7 +216,7 @@ void InnerKernel_relu(int m, int n, int k, float alpha, const float *A, int lda,
  }
 }
-//计算一个更小的 4 * 4 的 C 矩阵分块
+// 计算一个更小的 4 * 4 的 C 矩阵分块
 #if defined(IOS)
 void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
               int ldb, float beta, float *C, int ldc, int mc, int nc) {
@@ -822,9 +822,6 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
  int _nc1 = n % 16;
  int volatile nc2 = _nc1 / 4;
  int volatile nc3 = _nc1 % 4;
-  //  DLOG << "GEMM VECTOR kc1 = " << kc1 << ", kc2 = " << kc2;
-  //  DLOG << "GEMM VECTOR nc1 = " << nc1 << ", nc2 = " << nc2 << ", nc3 = " <<
-  //  nc3;
  for (int i = 0; i < kc1; i++) {
    a0 = A + i * 4;
    b0 = B + i * 4 * ldb;

--- a/src/operators/math/gemm.h
+++ b/src/operators/math/gemm.h
@@ -55,7 +55,7 @@ void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
 // 向量矩阵乘法 (M = 1)
 void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
-                   const float *B, int ldb, float beta, float *C, int ldc);
+                  const float *B, int ldb, float beta, float *C, int ldc);
 // 计算一个更小的 4 * 4 的 C 矩阵分块
 void AddDot4x4(int k, float alpha, const float *A, int lda, const float *B,