提交 a18c0d42 编写于 作者: T tensor-tang

cache fc kernel

test=develop
上级 6e1ee7fb
......@@ -30,15 +30,17 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
return;
}
if (relu) {
-auto compute =
-jit::Get<jit::kVAddRelu, jit::XYZNTuples<T>, platform::CPUPlace>(N);
+auto compute = jit::KernelFuncs<jit::kVAddRelu, jit::XYZNTuples<T>,
+platform::CPUPlace>::Cache()
+.At(N);
for (int i = 0; i < M; i++) {
T* dst = Y + i * N;
compute(B, dst, dst, N);
}
} else {
-auto compute =
-jit::Get<jit::kVAdd, jit::XYZNTuples<T>, platform::CPUPlace>(N);
+auto compute = jit::KernelFuncs<jit::kVAdd, jit::XYZNTuples<T>,
+platform::CPUPlace>::Cache()
+.At(N);
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册