提交 a18c0d42，作者：tensor-tang

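Cache the FC kernel lookup: in FCCompute, replace jit::Get<...>(N) with jit::KernelFuncs<...>::Cache().At(N) for the kVAddRelu and kVAdd kernels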

test=develop
上级 6e1ee7fb
...@@ -30,15 +30,17 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M, ...@@ -30,15 +30,17 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
return; return;
} }
if (relu) { if (relu) {
auto compute = auto compute = jit::KernelFuncs<jit::kVAddRelu, jit::XYZNTuples<T>,
jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(N); platform::CPUPlace>::Cache()
.At(N);
for (int i = 0; i < M; i++) { for (int i = 0; i < M; i++) {
T* dst = Y + i * N; T* dst = Y + i * N;
compute(B, dst, dst, N); compute(B, dst, dst, N);
} }
} else { } else {
auto compute = auto compute = jit::KernelFuncs<jit::kVAdd, jit::XYZNTuples<T>,
jit::Get<jit::kVAdd, jit::XYZNTuples<T>, platform::CPUPlace>(N); platform::CPUPlace>::Cache()
.At(N);
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#pragma omp parallel for #pragma omp parallel for
#endif #endif
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册