diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc
index 383532d8d227db308c01a160e177c8877719cdba..5c5a61f64093802697eb21452267471129c7fcf3 100644
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -187,6 +187,9 @@ void BenchAXYNKernel() {
     RandomVec<T>(d, x_data);
     BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data<T>(), y_data,
                                                      d);
+    // test in-place: x_data is passed as the output instead of y_data
+    BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data<T>(), x_data,
+                                                     d);
   }
 }
 
diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc
index a2a5661b93ad3d885983c502566860aa313d110f..e7a7375879064eb27c94315fe7b93eece7866b92 100644
--- a/paddle/fluid/operators/jit/gen/act.cc
+++ b/paddle/fluid/operators/jit/gen/act.cc
@@ -81,9 +81,7 @@ void VActJitCode::genCode() {
 #define DECLARE_ACT_CREATOR(name)                                            \
   class name##Creator : public JitCodeCreator<int> {                         \
    public:                                                                   \
-    bool UseMe(const int& attr) const override {                             \
-      return platform::MayIUse(platform::avx);                               \
-    }                                                                        \
+    bool UseMe(const int& attr) const override;                              \
     size_t CodeSize(const int& d) const override;                            \
     std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
       return make_unique<name##JitCode>(attr, CodeSize(attr));               \
@@ -98,6 +96,30 @@ DECLARE_ACT_CREATOR(VSigmoid);
 DECLARE_ACT_CREATOR(VTanh);
 
 // TODO(TJ): tuning use me
+bool VReluCreator::UseMe(const int& d) const {
+  return platform::MayIUse(platform::avx);  // d is unused; AVX decides
+}
+
+bool VSquareCreator::UseMe(const int& d) const {
+  return platform::MayIUse(platform::avx);  // d is unused; AVX decides
+}
+
+bool VIdentityCreator::UseMe(const int& d) const {
+  return platform::MayIUse(platform::avx);  // d is unused; AVX decides
+}
+
+bool VExpCreator::UseMe(const int& d) const {
+  return platform::MayIUse(platform::avx) && d < 32;  // rejects d >= 32
+}
+
+bool VSigmoidCreator::UseMe(const int& d) const {
+  return platform::MayIUse(platform::avx);  // d is unused; AVX decides
+}
+
+bool VTanhCreator::UseMe(const int& d) const {
+  return platform::MayIUse(platform::avx);  // d is unused; AVX decides
+}
+
 size_t VReluCreator::CodeSize(const int& d) const {
   return 96 /* init size */ +
          (d / YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ *
diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h
index fbf34fc4b3db49596b6be0360c00e77c12fab9b8..7bdc45779b7d39d36db0d52ca9361943cdcdef3e 100644
--- a/paddle/fluid/operators/jit/helper.h
+++ b/paddle/fluid/operators/jit/helper.h
@@ -118,6 +118,28 @@ typename KernelTuples::func_type Get(
   return GetRefer<KT, KernelTuples>();
 }
 
+// Cache of kernel functions keyed by int; Instance() is per-thread.
+template <KernelType KT, typename KernelTuples>
+class KernelFuncsCache {
+ public:
+  KernelFuncsCache() = default;
+  static KernelFuncsCache& Instance() {
+    static thread_local KernelFuncsCache<KT, KernelTuples> tls_cache;
+    return tls_cache;
+  }
+  bool Has(int key) const { return func_map_.count(key) != 0; }
+  // At() throws std::out_of_range (via unordered_map::at) for a missing key;
+  typename KernelTuples::func_type At(int key) { return func_map_.at(key); }
+  // Insert() leaves an existing entry untouched (emplace does not overwrite).
+  void Insert(int key, typename KernelTuples::func_type func) {
+    func_map_.emplace(key, func);
+  }
+
+ private:
+  std::unordered_map<int, typename KernelTuples::func_type> func_map_;
+  DISABLE_COPY_AND_ASSIGN(KernelFuncsCache);
+};
+
 const char* to_string(KernelType kt);
 const char* to_string(SeqPoolType kt);
 
diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc
index 2a75eb23cdffb6f24ed4eaf3b38f13855ebeadcf..0f42ac158ca7926981df55936cb903d5f4ae4806 100644
--- a/paddle/fluid/operators/jit/more/mix/mix.cc
+++ b/paddle/fluid/operators/jit/more/mix/mix.cc
@@ -49,12 +49,50 @@ void VTanh(const T* x, T* y, int n) {
 }
 
 void Softmax(const T* x, T* y, int n, int bs) {
-  auto compute_hmax = Get<kHMax, XRNTuples<T>, platform::CPUPlace>(n);
-  auto compute_hsum = Get<kHSum, XRNTuples<T>, platform::CPUPlace>(n);
-  auto compute_vscal = Get<kVScal, AXYNTuples<T>, platform::CPUPlace>(n);
-  auto compute_vaddbias = Get<kVAddBias, AXYNTuples<T>, platform::CPUPlace>(n);
-  auto compute_vexp =
-      Get<KernelType::kVExp, XYNTuples<T>, platform::CPUPlace>(n);
+  typename XRNTuples<T>::func_type compute_hmax{nullptr};
+  typename XRNTuples<T>::func_type compute_hsum{nullptr};
+  typename AXYNTuples<T>::func_type compute_vscal{nullptr};
+  typename AXYNTuples<T>::func_type compute_vaddbias{nullptr};
+  typename XYNTuples<T>::func_type compute_vexp{nullptr};
+
+  if (!KernelFuncsCache<kHMax, XRNTuples<T>>::Instance().Has(n)) {
+    compute_hmax = Get<kHMax, XRNTuples<T>, platform::CPUPlace>(n);
+    KernelFuncsCache<kHMax, XRNTuples<T>>::Instance().Insert(n, compute_hmax);
+  } else {
+    compute_hmax = KernelFuncsCache<kHMax, XRNTuples<T>>::Instance().At(n);
+  }
+
+  if (!KernelFuncsCache<kHSum, XRNTuples<T>>::Instance().Has(n)) {
+    compute_hsum = Get<kHSum, XRNTuples<T>, platform::CPUPlace>(n);
+    KernelFuncsCache<kHSum, XRNTuples<T>>::Instance().Insert(n, compute_hsum);
+  } else {
+    compute_hsum = KernelFuncsCache<kHSum, XRNTuples<T>>::Instance().At(n);
+  }
+
+  if (!KernelFuncsCache<kVScal, AXYNTuples<T>>::Instance().Has(n)) {
+    compute_vscal = Get<kVScal, AXYNTuples<T>, platform::CPUPlace>(n);
+    KernelFuncsCache<kVScal, AXYNTuples<T>>::Instance().Insert(n,
+                                                               compute_vscal);
+  } else {
+    compute_vscal = KernelFuncsCache<kVScal, AXYNTuples<T>>::Instance().At(n);
+  }
+
+  if (!KernelFuncsCache<kVAddBias, AXYNTuples<T>>::Instance().Has(n)) {
+    compute_vaddbias = Get<kVAddBias, AXYNTuples<T>, platform::CPUPlace>(n);
+    KernelFuncsCache<kVAddBias, AXYNTuples<T>>::Instance().Insert(
+        n, compute_vaddbias);
+  } else {
+    compute_vaddbias =
+        KernelFuncsCache<kVAddBias, AXYNTuples<T>>::Instance().At(n);
+  }
+
+  if (!KernelFuncsCache<kVExp, XYNTuples<T>>::Instance().Has(n)) {
+    compute_vexp = Get<KernelType::kVExp, XYNTuples<T>, platform::CPUPlace>(n);
+    KernelFuncsCache<kVExp, XYNTuples<T>>::Instance().Insert(n, compute_vexp);
+  } else {
+    compute_vexp = KernelFuncsCache<kVExp, XYNTuples<T>>::Instance().At(n);
+  }
+
   for (int i = 0; i < bs; ++i) {
     T scalar;
     compute_hmax(x, &scalar, n);
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc
index b13b8638e28b48208de5bc0058a20a009f6cf0b0..28a37198dae19a57509934ec784746bc23436e7a 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.cc
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc
@@ -179,7 +179,8 @@ bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const {
 
 template <>
 bool SoftmaxKernel<float>::UseMe(const int& d) const {
-  return true;
+  // tuned on avx2; returns true only when AVX may be used and d < 60
+  return platform::MayIUse(platform::avx) && d < 60;
 }
 
 #define AWALYS_USE_ME_WITH_DOUBLE(func)                  \
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 6bbb7155dda9b2c844f793a63adb861c2ed956e8..e20524012a5839fd250b7426a5efc42b7e87fe87 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -53,7 +53,7 @@ math_library(sequence2batch)
 math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function jit_kernel_helper)
 math_library(sequence_scale)
-math_library(softmax DEPS math_function)
+math_library(softmax DEPS math_function jit_kernel_helper)
 math_library(beam_search DEPS math_function)
 
 math_library(matrix_bit_code)
diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h
index 1d9d98b10646af9e199f6c481740d30745888707..1ff9ff684fc8001afb0f768a033b4c5bd1592702 100644
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -16,8 +16,8 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/jit/kernels.h"
 
-#include "paddle/fluid/operators/math/blas.h"
 namespace paddle {
 namespace operators {
 namespace math {
@@ -81,28 +81,10 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
     const int kBatchDim = 0;
     const int kClassDim = 1;
     // 2D data. Batch x C
-    const int batch_size = in_dims[kBatchDim];
-    const int num_classes = in_dims[kClassDim];
-    std::vector<float> entities(batch_size);
-    auto blas = math::GetBlas<DeviceContext, float>(context);
-    for (int n = 0; n < batch_size; ++n) {
-      entities[n] = in_data[n * num_classes];
-      for (int c = 1; c < num_classes; ++c) {
-        entities[n] = in_data[n * num_classes + c] > entities[n]
-                          ? in_data[n * num_classes + c]
-                          : entities[n];
-      }
-      for (int c = 0; c < num_classes; ++c) {
-        out_data[n * num_classes + c] =
-            in_data[n * num_classes + c] - entities[n];
-      }
-    }
-
-    blas.VEXP(num_classes * batch_size, out_data, out_data);
-    for (int n = 0; n < batch_size; ++n) {
-      auto sum = blas.ASUM(num_classes, &out_data[n * num_classes], 1);
-      blas.SCAL(num_classes, 1.0f / sum, &out_data[n * num_classes]);
-    }
+    auto compute_softmax =
+        jit::Get<jit::kSoftmax, jit::SoftmaxTuples<float>, platform::CPUPlace>(
+            in_dims[kClassDim]);
+    compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]);
   }
 };