提交 58183ed5 编写于 作者: Z zhaojiaying01

Add vector-matrix multiplication to Gemm

上级 85d7f010
...@@ -14,17 +14,17 @@ limitations under the License. */ ...@@ -14,17 +14,17 @@ limitations under the License. */
#pragma once #pragma once
#include "t_malloc.h" #include "memory/t_malloc.h"
#include <cstdlib> #include <cstdlib>
#include <cstring> #include <cstring>
namespace paddle_mobile { namespace paddle_mobile {
namespace memory { namespace memory {
const int MALLOC_ALIGN = 64; const int MALLOC_ALIGN = 16;
void Copy(void *dst, const void *src, size_t num) { void Copy(void *dst, const void *src, size_t num) {
std::memcpy(dst, src, num); std::memcpy(dst, src, num);
}; }
void *Alloc(size_t size) { void *Alloc(size_t size) {
size_t offset = sizeof(void *) + MALLOC_ALIGN - 1; size_t offset = sizeof(void *) + MALLOC_ALIGN - 1;
......
...@@ -216,7 +216,7 @@ void InnerKernel_relu(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -216,7 +216,7 @@ void InnerKernel_relu(int m, int n, int k, float alpha, const float *A, int lda,
} }
} }
//计算一个更小的 4 * 4 的 C 矩阵分块 // 计算一个更小的 4 * 4 的 C 矩阵分块
#if defined(IOS) #if defined(IOS)
void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b, void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc) { int ldb, float beta, float *C, int ldc, int mc, int nc) {
...@@ -822,9 +822,6 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -822,9 +822,6 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
int _nc1 = n % 16; int _nc1 = n % 16;
int volatile nc2 = _nc1 / 4; int volatile nc2 = _nc1 / 4;
int volatile nc3 = _nc1 % 4; int volatile nc3 = _nc1 % 4;
// DLOG << "GEMM VECTOR kc1 = " << kc1 << ", kc2 = " << kc2;
// DLOG << "GEMM VECTOR nc1 = " << nc1 << ", nc2 = " << nc2 << ", nc3 = " <<
// nc3;
for (int i = 0; i < kc1; i++) { for (int i = 0; i < kc1; i++) {
a0 = A + i * 4; a0 = A + i * 4;
b0 = B + i * 4 * ldb; b0 = B + i * 4 * ldb;
......
...@@ -55,7 +55,7 @@ void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -55,7 +55,7 @@ void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
// 向量矩阵乘法 (M = 1) // 向量矩阵乘法 (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc); const float *B, int ldb, float beta, float *C, int ldc);
// 计算一个更小的 4 * 4 的 C 矩阵分块 // 计算一个更小的 4 * 4 的 C 矩阵分块
void AddDot4x4(int k, float alpha, const float *A, int lda, const float *B, void AddDot4x4(int k, float alpha, const float *A, int lda, const float *B,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册