diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 1b9360afcecf63ff0c3e306cdf303cc426e80f1e..97ddf223aefcdfaf8a488f93a152336c1ed458f4 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -93,6 +93,7 @@ std::vector TestSizes() { template struct BenchFunc { // return this function avg time + // TODO(TJ): clear cache every time double operator()(const typename KernelTuples::func_type tgt, Args... args) { for (int i = 0; i < FLAGS_burning; ++i) { tgt(args...); @@ -172,6 +173,9 @@ void BenchXYZNKernel() { RandomVec(d, y_data); BenchAllImpls, PlaceType>(d, x.data(), y.data(), z_data, d); + // test inplace + BenchAllImpls, PlaceType>(d, x.data(), z_data, + z_data, d); } } diff --git a/paddle/fluid/operators/jit/gen/blas.cc b/paddle/fluid/operators/jit/gen/blas.cc index dee6c7b9d3ee9756c1b11d10d55fdca341cbee85..5da24c359edd2df93333fe0ca8a18cdc7385aadb 100644 --- a/paddle/fluid/operators/jit/gen/blas.cc +++ b/paddle/fluid/operators/jit/gen/blas.cc @@ -155,7 +155,7 @@ class NCHW16CMulNCCreator : public JitCodeCreator { class name##Creator : public JitCodeCreator { \ public: \ bool UseMe(const int& attr) const override { \ - return platform::MayIUse(platform::avx); \ + return platform::MayIUse(platform::avx) && attr <= 1024; \ } \ size_t CodeSize(const int& d) const override { \ return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \ diff --git a/paddle/fluid/operators/jit/gen/blas.h b/paddle/fluid/operators/jit/gen/blas.h index de6b33f467279124d7acd97709516c31706ec4f9..66a97c1be503b0fa983f9a7ec3b61c986774f16b 100644 --- a/paddle/fluid/operators/jit/gen/blas.h +++ b/paddle/fluid/operators/jit/gen/blas.h @@ -61,6 +61,7 @@ class VXXJitCode : public JitCode { base += "_Vec"; } base += (with_relu_ ? "_Relu" : ""); + base += "_D" + std::to_string(num_); return base.c_str(); } void genCode() override; diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index bba3a13619619b6de3f797a4efc4a0d09c3b281f..d5773d65940127ea0a9b77ed2760bd371b778f4c 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -118,26 +118,33 @@ typename KernelTuples::func_type Get( return GetRefer(); } -template -class KernelFuncsCache { +template +class KernelFuncs { public: - KernelFuncsCache() = default; - static KernelFuncsCache& Instance() { - static thread_local KernelFuncsCache g_func_cache; + KernelFuncs() = default; + static KernelFuncs& Cache() { + static thread_local KernelFuncs g_func_cache; return g_func_cache; } bool Has(int key) const { return funcs_.find(key) != funcs_.end(); } - typename KernelTuples::func_type At(int key) { return funcs_.at(key); } - void Insert(int key, typename KernelTuples::func_type func) { funcs_.emplace(key, func); } + typename KernelTuples::func_type At(int key) { + if (Has(key)) { + return funcs_.at(key); + } + auto func = Get(key); + Insert(key, func); + return func; + } + private: std::unordered_map funcs_; - DISABLE_COPY_AND_ASSIGN(KernelFuncsCache); + DISABLE_COPY_AND_ASSIGN(KernelFuncs); }; const char* to_string(KernelType kt); diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index 0f42ac158ca7926981df55936cb903d5f4ae4806..0036d1c238b17768c4df61af22a85588990e1815 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -49,49 +49,16 @@ void VTanh(const T* x, T* y, int n) { } void Softmax(const T* x, T* y, int n, int bs) { - typename XRNTuples::func_type compute_hmax{nullptr}; - typename XRNTuples::func_type compute_hsum{nullptr}; - typename AXYNTuples::func_type compute_vscal{nullptr}; - typename AXYNTuples::func_type compute_vaddbias{nullptr}; - typename XYNTuples::func_type compute_vexp{nullptr}; - - if (!KernelFuncsCache>::Instance().Has(n)) { - compute_hmax = Get, platform::CPUPlace>(n); - KernelFuncsCache>::Instance().Insert(n, compute_hmax); - } else { - compute_hmax = KernelFuncsCache>::Instance().At(n); - } - - if (!KernelFuncsCache>::Instance().Has(n)) { - compute_hsum = Get, platform::CPUPlace>(n); - KernelFuncsCache>::Instance().Insert(n, compute_hsum); - } else { - compute_hsum = KernelFuncsCache>::Instance().At(n); - } - - if (!KernelFuncsCache>::Instance().Has(n)) { - compute_vscal = Get, platform::CPUPlace>(n); - KernelFuncsCache>::Instance().Insert(n, - compute_vscal); - } else { - compute_vscal = KernelFuncsCache>::Instance().At(n); - } - - if (!KernelFuncsCache>::Instance().Has(n)) { - compute_vaddbias = Get, platform::CPUPlace>(n); - KernelFuncsCache>::Instance().Insert( - n, compute_vaddbias); - } else { - compute_vaddbias = - KernelFuncsCache>::Instance().At(n); - } - - if (!KernelFuncsCache>::Instance().Has(n)) { - compute_vexp = Get, platform::CPUPlace>(n); - KernelFuncsCache>::Instance().Insert(n, compute_vexp); - } else { - compute_vexp = KernelFuncsCache>::Instance().At(n); - } + auto compute_hmax = + KernelFuncs, platform::CPUPlace>::Cache().At(n); + auto compute_hsum = + KernelFuncs, platform::CPUPlace>::Cache().At(n); + auto compute_vscal = + KernelFuncs, platform::CPUPlace>::Cache().At(n); + auto compute_vaddbias = + KernelFuncs, platform::CPUPlace>::Cache().At(n); + auto compute_vexp = + KernelFuncs, platform::CPUPlace>::Cache().At(n); for (int i = 0; i < bs; ++i) { T scalar; diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index c7d0215eda9d1e14fcad16da7b70f45824789266..4c999131ab116ebe3484355158993558b02cc4b2 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -136,7 +136,7 @@ bool VMulKernel::UseMe(const int& d) const { template <> bool VAddKernel::UseMe(const int& d) const { - return platform::MayIUse(platform::avx512f) && d > 512; + return platform::MayIUse(platform::avx) && d > 512; } template <> diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index cddd0a18db53a7ddf9ca14d5f373180586ef6a31..0ad57c51be79cd3577b43c9af777bff710308fac 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -30,15 +30,17 @@ inline void FCCompute(const BlasT& blas, const int M, return; } if (relu) { - auto compute = - jit::Get, platform::CPUPlace>(N); + auto compute = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(N); for (int i = 0; i < M; i++) { T* dst = Y + i * N; compute(B, dst, dst, N); } } else { - auto compute = - jit::Get, platform::CPUPlace>(N); + auto compute = jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(N); #ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 1ff9ff684fc8001afb0f768a033b4c5bd1592702..a1cb3f972826a67721b00ce6df0ec48cc34d6e03 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -82,8 +82,9 @@ class SoftmaxFunctor> { const int kClassDim = 1; // 2D data. Batch x C auto compute_softmax = - jit::Get, platform::CPUPlace>( - in_dims[kClassDim]); + jit::KernelFuncs, + platform::CPUPlace>::Cache() + .At(in_dims[kClassDim]); compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]); } };