提交 58183ed5 编写于 作者: Z zhaojiaying01

add vector matrix multiplication in Gemm

上级 85d7f010
......@@ -14,17 +14,17 @@ limitations under the License. */
#pragma once
#include "t_malloc.h"
#include "memory/t_malloc.h"
#include <cstdlib>
#include <cstring>
namespace paddle_mobile {
namespace memory {
const int MALLOC_ALIGN = 64;
const int MALLOC_ALIGN = 16;
// Copies `num` bytes from `src` to `dst` using std::memcpy.
//
// dst: destination buffer; must have room for at least `num` bytes.
// src: source buffer; must hold at least `num` readable bytes.
// num: number of bytes to copy. A zero-length copy is a no-op and never
//      touches either pointer.
//
// The two ranges must not overlap (std::memcpy requirement).
void Copy(void *dst, const void *src, size_t num) {
  // std::memcpy has undefined behaviour when either pointer is null or
  // invalid, even for a zero-byte copy, so return before reaching it.
  if (num == 0) {
    return;
  }
  std::memcpy(dst, src, num);
}
}
void *Alloc(size_t size) {
size_t offset = sizeof(void *) + MALLOC_ALIGN - 1;
......
......@@ -216,7 +216,7 @@ void InnerKernel_relu(int m, int n, int k, float alpha, const float *A, int lda,
}
}
//计算一个更小的 4 * 4 的 C 矩阵分块
// 计算一个更小的 4 * 4 的 C 矩阵分块
#if defined(IOS)
void AddDot4x4(int k, float alpha, const float *a, int lda, const float *b,
int ldb, float beta, float *C, int ldc, int mc, int nc) {
......@@ -822,9 +822,6 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
int _nc1 = n % 16;
int volatile nc2 = _nc1 / 4;
int volatile nc3 = _nc1 % 4;
// DLOG << "GEMM VECTOR kc1 = " << kc1 << ", kc2 = " << kc2;
// DLOG << "GEMM VECTOR nc1 = " << nc1 << ", nc2 = " << nc2 << ", nc3 = " <<
// nc3;
for (int i = 0; i < kc1; i++) {
a0 = A + i * 4;
b0 = B + i * 4 * ldb;
......
......@@ -55,7 +55,7 @@ void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
// Vector-matrix multiplication kernel (the M = 1 case).
// NOTE(review): the parameter roles are not visible from this declaration;
// the names follow the usual GEMM convention (C = alpha * A * B + beta * C,
// with lda/ldb/ldc as leading dimensions) - confirm against the definition.
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc);
const float *B, int ldb, float beta, float *C, int ldc);
// 计算一个更小的 4 * 4 的 C 矩阵分块
void AddDot4x4(int k, float alpha, const float *A, int lda, const float *B,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册