From b4d653b08c397df5187425037036d2a8b3e65dca Mon Sep 17 00:00:00 2001 From: zhaojiaying01 Date: Mon, 25 Jun 2018 16:20:42 +0800 Subject: [PATCH] add vector matrix multiplication in Gemm --- src/memory/t_malloc.cpp | 6 +++--- src/operators/math/gemm.cpp | 5 +---- src/operators/math/gemm.h | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/memory/t_malloc.cpp b/src/memory/t_malloc.cpp index 92cd9ac036..4cb28d55d3 100644 --- a/src/memory/t_malloc.cpp +++ b/src/memory/t_malloc.cpp @@ -14,17 +14,17 @@ limitations under the License. */ #pragma once -#include "t_malloc.h" +#include "memory/t_malloc.h" #include #include namespace paddle_mobile { namespace memory { -const int MALLOC_ALIGN = 64; +const int MALLOC_ALIGN = 16; void Copy(void *dst, const void *src, size_t num) { std::memcpy(dst, src, num); -}; +} void *Alloc(size_t size) { size_t offset = sizeof(void *) + MALLOC_ALIGN - 1; diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index 7c42d6dce7..da3dacb58a 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -216,7 +216,7 @@ void InnerKernel_relu(int m, int n, int k, float alpha, const float *A, int lda, } } -//计算一个更小的 4 * 4 的 C 矩阵分块 +// 计算一个更小的 4 * 4 的 C 矩阵分块 #if defined(IOS) void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, int ldb, float beta, float *C, int ldc, int mc, int nc) { @@ -822,9 +822,6 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, int _nc1 = n % 16; int volatile nc2 = _nc1 / 4; int volatile nc3 = _nc1 % 4; - // DLOG << "GEMM VECTOR kc1 = " << kc1 << ", kc2 = " << kc2; - // DLOG << "GEMM VECTOR nc1 = " << nc1 << ", nc2 = " << nc2 << ", nc3 = " << - // nc3; for (int i = 0; i < kc1; i++) { a0 = A + i * 4; b0 = B + i * 4 * ldb; diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index b5351dd1e8..73d773987b 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -55,7 +55,7 @@ void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda, // 向量矩阵乘法 (M = 1) void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, int ldc); + const float *B, int ldb, float beta, float *C, int ldc); // 计算一个更小的 4 * 4 的 C 矩阵分块 void AddDot4x4(int k, float alpha, const float *A, int lda, const float *B, -- GitLab