diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc
index 186c37c56ec9410ac9a31503e33e7e334d0afc40..5c5a61f64093802697eb21452267471129c7fcf3 100644
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -158,7 +158,7 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
 
 using Tensor = paddle::framework::Tensor;
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchXYZNKernel() {
   for (int d : TestSizes()) {
     Tensor x, y, z;
@@ -175,7 +175,7 @@ void BenchXYZNKernel() {
   }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchAXYNKernel() {
   for (int d : TestSizes()) {
     const T a = static_cast<T>(3);
@@ -187,10 +187,23 @@ void BenchAXYNKernel() {
     RandomVec<T>(d, x_data);
     BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data<T>(), y_data,
                                                      d);
+    // test inplace
+    BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data<T>(), x_data,
+                                                     d);
+  }
+}
+
+template <jit::KernelType KT, typename T, typename PlaceType>
+void BenchXRNKernel() {
+  for (int d : TestSizes()) {
+    Tensor x;
+    RandomVec<T>(d, x.mutable_data<T>({d}, PlaceType()));
+    T res;
+    BenchAllImpls<KT, jit::XRNTuples<T>, PlaceType>(d, x.data<T>(), &res, d);
   }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchXYNKernel() {
   for (int d : TestSizes()) {
     Tensor x, y;
@@ -203,7 +216,7 @@ void BenchXYNKernel() {
   }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchLSTMKernel() {
   for (bool use_peephole : {true, false}) {
     for (int d : TestSizes()) {
@@ -240,7 +253,7 @@ void BenchLSTMKernel() {
   }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchGRUKernel() {
   for (int d : TestSizes()) {
     const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh);
@@ -262,7 +275,7 @@ void BenchGRUKernel() {
   }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchSeqPoolKernel() {
   std::vector<jit::SeqPoolType> pool_types = {
       jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
@@ -284,7 +297,7 @@ void BenchSeqPoolKernel() {
   }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchMatMulKernel() {
   for (int m : {1, 2, 3, 4}) {
     for (int n : TestSizes()) {
@@ -305,57 +318,64 @@ void BenchMatMulKernel() {
   }
 }
 
+template <jit::KernelType KT, typename T, typename PlaceType>
+void BenchSoftmaxKernel() {  // runs BenchAllImpls for each (rows, cols) pair
+  for (int rows : {1, 2, 10}) {
+    for (int cols : TestSizes()) {
+      Tensor input, output;
+      input.Resize({rows, cols});
+      output.Resize({rows, cols});
+      RandomVec<T>(rows * cols, input.mutable_data<T>(PlaceType()), -2.f, 2.f);
+      const T* in = input.data<T>();
+      T* out = output.mutable_data<T>(PlaceType());
+      BenchAllImpls<KT, jit::SoftmaxTuples<T>, PlaceType>(cols, in, out, cols,
+                                                          rows);
+    }
+  }
+}
+
 using T = float;
-using PlaceType = paddle::platform::CPUPlace;
+using CPUPlace = paddle::platform::CPUPlace;
 
 // xyzn
-BENCH_FP32_CPU(kVMul) { BenchXYZNKernel<jit::kVMul, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel<jit::kVAdd, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel<jit::kVAddRelu, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kVSub) { BenchXYZNKernel<jit::kVSub, T, PlaceType>(); }
+BENCH_FP32_CPU(kVMul) { BenchXYZNKernel<jit::kVMul, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel<jit::kVAdd, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel<jit::kVAddRelu, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVSub) { BenchXYZNKernel<jit::kVSub, T, CPUPlace>(); }
 
 // axyn
-BENCH_FP32_CPU(kVScal) { BenchAXYNKernel<jit::kVScal, T, PlaceType>(); }
+BENCH_FP32_CPU(kVScal) { BenchAXYNKernel<jit::kVScal, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel<jit::kVAddBias, T, CPUPlace>(); }
 
-BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel<jit::kVAddBias, T, PlaceType>(); }
+// xrn
+BENCH_FP32_CPU(kHSum) { BenchXRNKernel<jit::kHSum, T, CPUPlace>(); }
+BENCH_FP32_CPU(kHMax) { BenchXRNKernel<jit::kHMax, T, CPUPlace>(); }
 
 // xyn
-BENCH_FP32_CPU(kVRelu) { BenchXYNKernel<jit::kVRelu, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel<jit::kVIdentity, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, PlaceType>(); }
+BENCH_FP32_CPU(kVRelu) { BenchXYNKernel<jit::kVRelu, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel<jit::kVIdentity, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, CPUPlace>(); }
+BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, CPUPlace>(); }
 
 // lstm and peephole
-BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel<jit::kLSTMC1H1, T, PlaceType>(); }
+BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, CPUPlace>(); }
+BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel<jit::kLSTMC1H1, T, CPUPlace>(); }
 
 // gru functions
-BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel<jit::kGRUH1, T, PlaceType>(); }
-
-BENCH_FP32_CPU(kGRUHtPart1) {
-  BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
-}
-
-BENCH_FP32_CPU(kGRUHtPart2) {
-  BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
-}
+BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel<jit::kGRUH1, T, CPUPlace>(); }
+BENCH_FP32_CPU(kGRUHtPart1) { BenchGRUKernel<jit::kGRUHtPart1, T, CPUPlace>(); }
+BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel<jit::kGRUHtPart2, T, CPUPlace>(); }
 
 // seq pool function
-BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, PlaceType>(); }
+BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, CPUPlace>(); }
 
 // matmul
-BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, PlaceType>(); }
+BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, CPUPlace>(); }
+
+// softmax
+BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel<jit::kSoftmax, T, CPUPlace>(); }
 
 // Benchmark all jit kernels including jitcode, mkl and refer.
 // To use this tool, run command: ./benchmark [options...]
diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt
index 40310c2d2b372a414054f75348e8e1b4471bf3d2..2ea8f927e1a13867fa2065841fac05e766735237 100644
--- a/paddle/fluid/operators/jit/gen/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt
@@ -28,3 +28,5 @@ USE_JITKERNEL_GEN(kGRUHtPart1)
 USE_JITKERNEL_GEN(kGRUHtPart2)
 USE_JITKERNEL_GEN(kNCHW16CMulNC)
 USE_JITKERNEL_GEN(kSeqPool)
+USE_JITKERNEL_GEN(kHMax)
+USE_JITKERNEL_GEN(kHSum)
diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc
index a2a5661b93ad3d885983c502566860aa313d110f..e7a7375879064eb27c94315fe7b93eece7866b92 100644
--- a/paddle/fluid/operators/jit/gen/act.cc
+++ b/paddle/fluid/operators/jit/gen/act.cc
@@ -81,9 +81,7 @@ void VActJitCode::genCode() {
 #define DECLARE_ACT_CREATOR(name)                                            \
   class name##Creator : public JitCodeCreator<int> {                         \
    public:                                                                   \
-    bool UseMe(const int& attr) const override {                             \
-      return platform::MayIUse(platform::avx);                               \
-    }                                                                        \
+    bool UseMe(const int& attr) const override;                              \
     size_t CodeSize(const int& d) const override;                            \
     std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
       return make_unique<name##JitCode>(attr, CodeSize(attr));               \
@@ -98,6 +96,30 @@ DECLARE_ACT_CREATOR(VSigmoid);
 DECLARE_ACT_CREATOR(VTanh);
 
 // TODO(TJ): tuning use me
+bool VReluCreator::UseMe(const int& d) const {
+  return platform::MayIUse(platform::avx);
+}
+
+bool VSquareCreator::UseMe(const int& d) const {
+  return platform::MayIUse(platform::avx);
+}
+
+bool VIdentityCreator::UseMe(const int& d) const {
+  return platform::MayIUse(platform::avx);
+}
+
+bool VExpCreator::UseMe(const int& d) const {
+  return platform::MayIUse(platform::avx) && d < 32;
+}
+
+bool VSigmoidCreator::UseMe(const int& d) const {
+  return platform::MayIUse(platform::avx);
+}
+
+bool VTanhCreator::UseMe(const int& d) const {
+  return platform::MayIUse(platform::avx);
+}
+
 size_t VReluCreator::CodeSize(const int& d) const {
   return 96 /* init size */ +
          (d / YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ *
diff --git a/paddle/fluid/operators/jit/gen/hopv.cc b/paddle/fluid/operators/jit/gen/hopv.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e7884017198623d996fe98a55691da6e342d656a
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/hopv.cc
@@ -0,0 +1,103 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/gen/hopv.h"
+#include "paddle/fluid/operators/jit/registry.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+void HOPVJitCode::genCode() {
+  const int num_blocks = num_ / YMM_FLOAT_BLOCK;  // count of full YMM blocks
+  int offset = 0;
+
+  if (num_blocks > 0) {
+    // load the first block as the initial accumulator
+    vmovups(ymm_tmp, ptr[param_src]);
+    offset += sizeof(float) * YMM_FLOAT_BLOCK;
+    for (int i = 1; i < num_blocks; ++i) {
+      vmovups(ymm_src, ptr[param_src + offset]);
+      process(ymm_tmp, ymm_src, ymm_tmp);
+      offset += sizeof(float) * YMM_FLOAT_BLOCK;
+    }
+    vextractf128(xmm_dst, ymm_tmp, 1);  // upper 128 bits of the accumulator
+    process(xmm_dst, xmm_dst, xmm_tmp);  // fold with the lower 128 bits
+  } else {  // no full block: seed the accumulator instead
+    if (type_ == operand_type::MAX) {
+      vbroadcastss(ymm_dst, ptr[param_src]);
+    } else if (type_ == operand_type::ADD) {
+      vxorps(ymm_dst, ymm_dst, ymm_dst);
+    }
+  }
+
+  int rest = num_ % YMM_FLOAT_BLOCK;  // tail elements past the full blocks
+  if (rest >= 4) {
+    vmovups(xmm_src, ptr[param_src + offset]);
+    offset += sizeof(float) * 4;
+    rest -= 4;
+    process(xmm_dst, xmm_dst, xmm_src);
+  }
+
+  vpermilps(xmm_tmp, xmm_dst, 16 + 8 + 3);  // imm 0b00011011: reverse lanes
+  process(xmm_dst, xmm_dst, xmm_tmp);
+
+  if (rest >= 2) {
+    vmovq(xmm_src, ptr[param_src + offset]);  // loads two floats (64 bits)
+    offset += sizeof(float) * 2;
+    rest -= 2;
+    process(xmm_dst, xmm_dst, xmm_src);
+  }
+
+  vpermilps(xmm_tmp, xmm_dst, 1);  // lane 0 of tmp <- lane 1 of dst
+  process(xmm_dst, xmm_dst, xmm_tmp);
+
+  if (rest >= 1) {
+    vmovss(xmm_src, ptr[param_src + offset]);
+    process(xmm_dst, xmm_dst, xmm_src);
+  }
+  vmovss(ptr[param_dst], xmm_dst);  // store lane 0 as the result
+  ret();
+}
+
+#define DECLARE_HOP_CREATOR(name)                                            \
+  class name##Creator : public JitCodeCreator<int> {                         \
+   public:                                                                   \
+    bool UseMe(const int& attr) const override {                             \
+      return platform::MayIUse(platform::avx);                               \
+    }                                                                        \
+    size_t CodeSize(const int& d) const override {                           \
+      return 96 + d / YMM_FLOAT_BLOCK * 4 * 8;                               \
+    }                                                                        \
+    std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
+      return make_unique<name##JitCode>(attr, CodeSize(attr));               \
+    }                                                                        \
+  }
+
+DECLARE_HOP_CREATOR(HMax);
+DECLARE_HOP_CREATOR(HSum);
+
+#undef DECLARE_HOP_CREATOR
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
+
+namespace gen = paddle::operators::jit::gen;
+
+REGISTER_JITKERNEL_GEN(kHMax, gen::HMaxCreator);
+REGISTER_JITKERNEL_GEN(kHSum, gen::HSumCreator);
diff --git a/paddle/fluid/operators/jit/gen/hopv.h b/paddle/fluid/operators/jit/gen/hopv.h
new file mode 100644
index 0000000000000000000000000000000000000000..d3bc94b63d3f962cd655367a2afe1a08582b06fa
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/hopv.h
@@ -0,0 +1,90 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "glog/logging.h"
+#include "paddle/fluid/operators/jit/gen/jitcode.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+// Horizontal operand vector: JIT code that folds a float vector of length d
+// into one scalar using max (MAX) or addition (ADD).
+class HOPVJitCode : public JitCode {
+ public:
+  explicit HOPVJitCode(int d, operand_type type, size_t code_size = 256 * 1024,
+                       void* code_ptr = nullptr)
+      : JitCode(code_size, code_ptr), num_(d), type_(type) {
+    if (!(type_ == operand_type::MAX || type_ == operand_type::ADD)) {
+      LOG(FATAL) << "Do not support this operand type: " << type_;
+    }
+    this->genCode();
+  }
+
+  // Returns a string literal, which has static storage duration, so the
+  // pointer stays valid; a pointer into a local std::string would dangle.
+  virtual const char* name() const {
+    return type_ == operand_type::MAX ? "VXXJitCode_MAX" : "VXXJitCode_SUM";
+  }
+  void genCode() override;
+
+ protected:
+  // Emits dst = max(src1, src2) for MAX or dst = src1 + src2 for ADD.
+  template <typename JMM>
+  void process(JMM& dst, JMM& src1, JMM& src2) {  // NOLINT
+    if (type_ == operand_type::MAX) {
+      vmaxps(dst, src1, src2);
+    } else if (type_ == operand_type::ADD) {
+      vaddps(dst, src1, src2);
+    }
+  }
+
+ private:
+  int num_;
+  operand_type type_;
+  reg64_t param_src{abi_param1};
+  reg64_t param_dst{abi_param2};
+  reg64_t param_attr{abi_param3};
+
+  // xmm_N is the low 128 bits of ymm_N, so each same-indexed pair below
+  // names one physical register.
+  ymm_t ymm_tmp = ymm_t(0);
+  ymm_t ymm_src = ymm_t(1);
+  ymm_t ymm_dst = ymm_t(2);
+
+  xmm_t xmm_tmp = xmm_t(0);
+  xmm_t xmm_src = xmm_t(1);
+  xmm_t xmm_dst = xmm_t(2);
+};
+
+#define DECLARE_HOP_JITCODE(name, op_type)                                    \
+  class name##JitCode : public HOPVJitCode {                                  \
+   public:                                                                    \
+    explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \
+        : HOPVJitCode(d, op_type, code_size, code_ptr) {}                     \
+  };
+
+DECLARE_HOP_JITCODE(HMax, operand_type::MAX);
+DECLARE_HOP_JITCODE(HSum, operand_type::ADD);
+
+#undef DECLARE_HOP_JITCODE
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
index f63d40ad5a559ab87a9b3735406671cfd936d9e4..c388109604bc57e8475e79a6c57eecb5bfebfb52 100644
--- a/paddle/fluid/operators/jit/gen/jitcode.h
+++ b/paddle/fluid/operators/jit/gen/jitcode.h
@@ -47,6 +47,7 @@ using Label = Xbyak::Label;
 
 typedef enum {
   MUL = 0,
+  MAX,
   ADD,
   SUB,
   RELU,
diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc
index 5dbe22a81b4866bdf60a03710d8ffd0b7bcb597b..4dac2f2460f72c7da63f48c82549b948cc253153 100644
--- a/paddle/fluid/operators/jit/helper.cc
+++ b/paddle/fluid/operators/jit/helper.cc
@@ -49,6 +49,9 @@ const char* to_string(KernelType kt) {
     ONE_CASE(kNCHW16CMulNC);
     ONE_CASE(kSeqPool);
     ONE_CASE(kMatMul);
+    ONE_CASE(kHMax);
+    ONE_CASE(kHSum);
+    ONE_CASE(kSoftmax);
     default:
       PADDLE_THROW("Not support type: %d, or forget to add it.", kt);
       return "NOT JITKernel";
diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h
index fbf34fc4b3db49596b6be0360c00e77c12fab9b8..7bdc45779b7d39d36db0d52ca9361943cdcdef3e 100644
--- a/paddle/fluid/operators/jit/helper.h
+++ b/paddle/fluid/operators/jit/helper.h
@@ -118,6 +118,28 @@ typename KernelTuples::func_type Get(
   return GetRefer<KT, KernelTuples>();
 }
 
+template <KernelType KT, typename KernelTuples>
+class KernelFuncsCache {  // per-thread map: int key -> kernel function
+ public:
+  KernelFuncsCache() = default;
+  static KernelFuncsCache& Instance() {  // one instance per thread
+    static thread_local KernelFuncsCache<KT, KernelTuples> g_func_cache;
+    return g_func_cache;
+  }
+  // True when a function is stored under key.
+  bool Has(int key) const { return funcs_.find(key) != funcs_.end(); }
+  // Returns the stored function; unordered_map::at throws if key is absent.
+  typename KernelTuples::func_type At(int key) { return funcs_.at(key); }
+  // Stores func under key; emplace leaves an existing entry untouched.
+  void Insert(int key, typename KernelTuples::func_type func) {
+    funcs_.emplace(key, func);
+  }
+
+ private:
+  std::unordered_map<int, typename KernelTuples::func_type> funcs_;
+  DISABLE_COPY_AND_ASSIGN(KernelFuncsCache);
+};
+
 const char* to_string(KernelType kt);
 const char* to_string(SeqPoolType kt);
 
diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h
index adb101bd5cdf231ac330dbf44beb4c24c1fcf29e..42a58580f7b1e0832af57398ba9c29882b6cc6fb 100644
--- a/paddle/fluid/operators/jit/kernel_base.h
+++ b/paddle/fluid/operators/jit/kernel_base.h
@@ -20,6 +20,7 @@ namespace paddle {
 namespace operators {
 namespace jit {
 
+// TODO(TJ): reorder by alphabet
 typedef enum {
   kNone = 0,
   kVMul = 1,
@@ -44,6 +45,9 @@ typedef enum {
   kNCHW16CMulNC,
   kSeqPool,
   kMatMul,
+  kHSum,  // horizontal sum
+  kHMax,  // horizontal max
+  kSoftmax,
 } KernelType;
 
 typedef enum {
@@ -70,6 +74,10 @@ struct XYNTuples {
   typedef void (*func_type)(const T*, T*, int);
 };
 
+// x, return and int
+template <typename T>
+struct XRNTuples : public XYNTuples<T> {};
+
 typedef struct {
   void* gates;  // gates: x_ch, x_ih, x_fh, x_oh
   const void* ct_1;
@@ -159,6 +167,13 @@ struct LayerNormTuples {
                             const float, int);
 };
 
+template <typename T>
+struct SoftmaxTuples {
+  typedef T data_type;
+  typedef int attr_type;
+  typedef void (*func_type)(const T*, T*, int, int);
+};
+
 // nChw16c = nChw16c .* NC
 template <typename T>
 struct NCHW16CMulNCTuples {
diff --git a/paddle/fluid/operators/jit/more/mix/CMakeLists.txt b/paddle/fluid/operators/jit/more/mix/CMakeLists.txt
index e05f204b1eebd03c7a00157d96d0482f4a44a7fb..dd039d29152961210958470a48f086a133ab640c 100644
--- a/paddle/fluid/operators/jit/more/mix/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/more/mix/CMakeLists.txt
@@ -12,3 +12,4 @@ USE_JITKERNEL_MORE(kLSTMC1H1, mix)
 USE_JITKERNEL_MORE(kGRUH1, mix)
 USE_JITKERNEL_MORE(kGRUHtPart1, mix)
 USE_JITKERNEL_MORE(kGRUHtPart2, mix)
+USE_JITKERNEL_MORE(kSoftmax, mix)
diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc
index df0a85256b1f546d5f64be73925cf58b87a25bd7..0f42ac158ca7926981df55936cb903d5f4ae4806 100644
--- a/paddle/fluid/operators/jit/more/mix/mix.cc
+++ b/paddle/fluid/operators/jit/more/mix/mix.cc
@@ -48,6 +48,65 @@ void VTanh(const T* x, T* y, int n) {
   compute_addbias(&b, y, y, n);
 }
 
+void Softmax(const T* x, T* y, int n, int bs) {  // bs rows, each of length n
+  typename XRNTuples<T>::func_type compute_hmax{nullptr};
+  typename XRNTuples<T>::func_type compute_hsum{nullptr};
+  typename AXYNTuples<T>::func_type compute_vscal{nullptr};
+  typename AXYNTuples<T>::func_type compute_vaddbias{nullptr};
+  typename XYNTuples<T>::func_type compute_vexp{nullptr};
+  // For each kernel: use the function cached for n, else Get and cache it.
+  if (!KernelFuncsCache<kHMax, XRNTuples<T>>::Instance().Has(n)) {
+    compute_hmax = Get<kHMax, XRNTuples<T>, platform::CPUPlace>(n);
+    KernelFuncsCache<kHMax, XRNTuples<T>>::Instance().Insert(n, compute_hmax);
+  } else {
+    compute_hmax = KernelFuncsCache<kHMax, XRNTuples<T>>::Instance().At(n);
+  }
+
+  if (!KernelFuncsCache<kHSum, XRNTuples<T>>::Instance().Has(n)) {
+    compute_hsum = Get<kHSum, XRNTuples<T>, platform::CPUPlace>(n);
+    KernelFuncsCache<kHSum, XRNTuples<T>>::Instance().Insert(n, compute_hsum);
+  } else {
+    compute_hsum = KernelFuncsCache<kHSum, XRNTuples<T>>::Instance().At(n);
+  }
+
+  if (!KernelFuncsCache<kVScal, AXYNTuples<T>>::Instance().Has(n)) {
+    compute_vscal = Get<kVScal, AXYNTuples<T>, platform::CPUPlace>(n);
+    KernelFuncsCache<kVScal, AXYNTuples<T>>::Instance().Insert(n,
+                                                               compute_vscal);
+  } else {
+    compute_vscal = KernelFuncsCache<kVScal, AXYNTuples<T>>::Instance().At(n);
+  }
+
+  if (!KernelFuncsCache<kVAddBias, AXYNTuples<T>>::Instance().Has(n)) {
+    compute_vaddbias = Get<kVAddBias, AXYNTuples<T>, platform::CPUPlace>(n);
+    KernelFuncsCache<kVAddBias, AXYNTuples<T>>::Instance().Insert(
+        n, compute_vaddbias);
+  } else {
+    compute_vaddbias =
+        KernelFuncsCache<kVAddBias, AXYNTuples<T>>::Instance().At(n);
+  }
+
+  if (!KernelFuncsCache<kVExp, XYNTuples<T>>::Instance().Has(n)) {
+    compute_vexp = Get<KernelType::kVExp, XYNTuples<T>, platform::CPUPlace>(n);
+    KernelFuncsCache<kVExp, XYNTuples<T>>::Instance().Insert(n, compute_vexp);
+  } else {
+    compute_vexp = KernelFuncsCache<kVExp, XYNTuples<T>>::Instance().At(n);
+  }
+
+  for (int i = 0; i < bs; ++i) {  // x and y advance one row per pass
+    T scalar;
+    compute_hmax(x, &scalar, n);
+    scalar = static_cast<T>(0) - scalar;  // negated row maximum
+    compute_vaddbias(&scalar, x, y, n);  // x - max
+    compute_vexp(y, y, n);
+    compute_hsum(y, &scalar, n);
+    scalar = static_cast<T>(1) / scalar;  // reciprocal of the row sum
+    compute_vscal(&scalar, y, y, n);
+    x += n;
+    y += n;
+  }
+}
+
 void (*getActFunc(KernelType type, int d))(const T*, T*, int) {  // NOLINT
   if (type == kVSigmoid) {
     return Get<kVSigmoid, XYNTuples<T>, platform::CPUPlace>(d);
@@ -184,6 +243,8 @@ bool VSigmoidKernel::UseMe(const int& d) const { return true; }
 
 bool VTanhKernel::UseMe(const int& d) const { return true; }
 
+bool SoftmaxKernel::UseMe(const int& d) const { return true; }
+
 bool LSTMCtHtKernel::UseMe(const lstm_attr_t& attr) const { return true; }
 
 bool LSTMC1H1Kernel::UseMe(const lstm_attr_t& attr) const { return true; }
@@ -207,6 +268,7 @@ namespace mix = paddle::operators::jit::more::mix;
 
 REGISTER_MORE_KERNEL(kVSigmoid, VSigmoid);
 REGISTER_MORE_KERNEL(kVTanh, VTanh);
+REGISTER_MORE_KERNEL(kSoftmax, Softmax);
 REGISTER_MORE_KERNEL(kLSTMCtHt, LSTMCtHt);
 REGISTER_MORE_KERNEL(kLSTMC1H1, LSTMC1H1);
 REGISTER_MORE_KERNEL(kGRUH1, GRUH1);
diff --git a/paddle/fluid/operators/jit/more/mix/mix.h b/paddle/fluid/operators/jit/more/mix/mix.h
index a70ecdf9348f511311307b4c27bb4506222a7439..d64af192197a0b339a39a1862c028875da2f3900 100644
--- a/paddle/fluid/operators/jit/more/mix/mix.h
+++ b/paddle/fluid/operators/jit/more/mix/mix.h
@@ -26,6 +26,7 @@ using T = float;
 
 void VSigmoid(const T* x, T* y, int n);
 void VTanh(const T* x, T* y, int n);
+void Softmax(const T* x, T* y, int n, int bs);
 
 void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr);
 void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr);
@@ -45,6 +46,9 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr);
 DECLARE_MORE_KERNEL(VSigmoid, XYNTuples);
 DECLARE_MORE_KERNEL(VTanh, XYNTuples);
 
+// Softmax
+DECLARE_MORE_KERNEL(Softmax, SoftmaxTuples);
+
 DECLARE_MORE_KERNEL(LSTMCtHt, LSTMTuples);
 DECLARE_MORE_KERNEL(LSTMC1H1, LSTMTuples);
 
diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
index 667c6dfad6676d00ab994564bff57c90caa0cb41..f9e5aea32e7cd48e9b39c4c3ee0e30f4a5c84f6f 100644
--- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
@@ -12,3 +12,4 @@ USE_JITKERNEL_MORE(kVSquare, mkl)
 USE_JITKERNEL_MORE(kVSigmoid, mkl)
 USE_JITKERNEL_MORE(kVTanh, mkl)
 USE_JITKERNEL_MORE(kSeqPool, mkl)
+USE_JITKERNEL_MORE(kSoftmax, mkl)
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc
index fccdc68f5efa34bac6f5a34a41569d2f77416284..28a37198dae19a57509934ec784746bc23436e7a 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.cc
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc
@@ -116,6 +116,16 @@ void VAXPY<double>(double a, const double* x, double* y, int n) {
   platform::dynload::cblas_daxpy(n, a, x, 1, y, 1);
 }
 
+template <>
+void ASum<float>(const float* x, float* res, int n) {
+  res[0] = platform::dynload::cblas_sasum(n, x, 1);
+}
+
+template <>
+void ASum<double>(const double* x, double* res, int n) {
+  res[0] = platform::dynload::cblas_dasum(n, x, 1);
+}
+
 // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
 template <>
 bool MatMulKernel<float>::UseMe(const int& d) const {
@@ -167,6 +177,12 @@ bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const {
   return true;
 }
 
+template <>
+bool SoftmaxKernel<float>::UseMe(const int& d) const {
+  // tuned on avx2
+  return platform::MayIUse(platform::avx) && d < 60;
+}
+
 #define AWALYS_USE_ME_WITH_DOUBLE(func)                  \
   template <>                                            \
   bool func##Kernel<double>::UseMe(const int& d) const { \
@@ -181,6 +197,7 @@ AWALYS_USE_ME_WITH_DOUBLE(VExp);
 AWALYS_USE_ME_WITH_DOUBLE(VSigmoid);
 AWALYS_USE_ME_WITH_DOUBLE(VTanh);
 AWALYS_USE_ME_WITH_DOUBLE(VSquare);
+AWALYS_USE_ME_WITH_DOUBLE(Softmax);
 
 #undef AWALYS_USE_ME_WITH_DOUBLE
 }  // namespace mkl
@@ -204,5 +221,6 @@ REGISTER_MKL_KERNEL(kVSquare, VSquare);
 REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
 REGISTER_MKL_KERNEL(kVTanh, VTanh);
 REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
+REGISTER_MKL_KERNEL(kSoftmax, Softmax);
 
 #undef REGISTER_MKL_KERNEL
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h
index a27196fa19f1d3e9aa6c414b6b9f99a21ef49025..6b95b9c872dc12cccaef0b0737edd760447a47d0 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
@@ -16,6 +16,7 @@
 
 #include <cmath>
 #include <type_traits>
+#include <vector>
 #include "paddle/fluid/operators/jit/kernel_base.h"
 
 namespace paddle {
@@ -90,6 +91,30 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
   }
 }
 
+template <typename T>
+void ASum(const T* x, T* res, int n);
+
+template <typename T>
+void Softmax(const T* x, T* y, int n, int bs) {  // bs rows, each of length n
+  std::vector<T> row_max(bs);
+  for (int r = 0; r < bs; ++r) {
+    const T* in = x + r * n;
+    row_max[r] = in[0];
+    for (int k = 1; k < n; ++k) {
+      if (in[k] > row_max[r]) row_max[r] = in[k];
+    }
+    for (int k = 0; k < n; ++k) y[r * n + k] = in[k] - row_max[r];
+  }
+  VExp(y, y, n * bs);  // exponentiate all shifted rows in one call
+  for (int r = 0; r < bs; ++r) {
+    T* out = y + r * n;
+    T norm;
+    ASum(out, &norm, n);
+    norm = static_cast<T>(1) / norm;  // reciprocal, used as the row scale
+    VScal(&norm, out, out, n);
+  }
+}
+
 #define DECLARE_MKL_KERNEL(name, tuples)                             \
   template <typename T>                                              \
   class name##Kernel : public KernelMore<tuples<T>> {                \
@@ -117,6 +142,8 @@ DECLARE_MKL_KERNEL(VSquare, XYNTuples);
 
 DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples);
 
+DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples);
+
 #undef DECLARE_MKL_KERNEL
 
 }  // namespace mkl
diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt
index 4b9bc5e8d49c62404d5d4ef99b7c50987fcb415a..9f2935828ca300dbdb71b0fefb6b9883cb45e4b0 100644
--- a/paddle/fluid/operators/jit/refer/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt
@@ -29,3 +29,6 @@ USE_JITKERNEL_REFER(kNCHW16CMulNC)
 USE_JITKERNEL_REFER(kSeqPool)
 USE_JITKERNEL_REFER(kMatMul)
 USE_JITKERNEL_REFER(kVSquare)
+USE_JITKERNEL_REFER(kHSum)
+USE_JITKERNEL_REFER(kHMax)
+USE_JITKERNEL_REFER(kSoftmax)
diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc
index 3512ad7fe7921381afb6152330fff6be34de5ad7..b8adb40ec7e1b64df2b04a3201292db235af7b19 100644
--- a/paddle/fluid/operators/jit/refer/refer.cc
+++ b/paddle/fluid/operators/jit/refer/refer.cc
@@ -52,4 +52,9 @@ REGISTER_REFER_KERNEL(kSeqPool, SeqPool);
 
 REGISTER_REFER_KERNEL(kMatMul, MatMul);
 
+REGISTER_REFER_KERNEL(kHMax, HMax);
+REGISTER_REFER_KERNEL(kHSum, HSum);
+
+REGISTER_REFER_KERNEL(kSoftmax, Softmax);
+
 #undef REGISTER_REFER_KERNEL
diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h
index 97d029358594d757f0e1874e9c87ecb8f97c9d50..5a074db7e0e8ab49dc281e1809edef23e6a25c42 100644
--- a/paddle/fluid/operators/jit/refer/refer.h
+++ b/paddle/fluid/operators/jit/refer/refer.h
@@ -378,6 +378,40 @@ void MatMul(const T* A, const T* B, T* C, int M, int N, int K) {
   }
 }
 
+template <typename T>
+void HMax(const T* x, T* res, int n) {  // res[0] = largest of x[0..n)
+  res[0] = x[0];
+  for (int k = 1; k < n; ++k) {
+    if (res[0] < x[k]) res[0] = x[k];
+  }
+}
+
+template <typename T>
+void HSum(const T* x, T* res, int n) {  // res[0] = x[0] + ... + x[n - 1]
+  res[0] = x[0];
+  for (int k = 1; k < n; ++k) {
+    res[0] = res[0] + x[k];
+  }
+}
+
+// Row-wise softmax over bs rows of length n:
+//   y = e^(x - max(x))
+//   y = y / sum(y)
+template <typename T>
+void Softmax(const T* x, T* y, int n, int bs = 1) {
+  for (int row = 0; row < bs; ++row, x += n, y += n) {
+    T bias;
+    HMax(x, &bias, n);
+    bias = static_cast<T>(0) - bias;  // negated row maximum
+    VAddBias(&bias, x, y, n);  // y = x - max
+    VExp(y, y, n);
+    T scale;
+    HSum(y, &scale, n);
+    scale = static_cast<T>(1) / scale;  // reciprocal of the row sum
+    VScal(&scale, y, y, n);
+  }
+}
+
 #define DECLARE_REFER_KERNEL(name, tuples)             \
   template <typename T>                                \
   class name##Kernel : public ReferKernel<tuples<T>> { \
@@ -421,6 +455,11 @@ DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples);
 
 DECLARE_REFER_KERNEL(MatMul, MatMulTuples);
 
+DECLARE_REFER_KERNEL(HMax, XRNTuples);
+DECLARE_REFER_KERNEL(HSum, XRNTuples);
+
+DECLARE_REFER_KERNEL(Softmax, SoftmaxTuples);
+
 #undef DECLARE_REFER_KERNEL
 
 }  // namespace refer
diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc
index 68a79b6314e4cf86f5b715b9c6694924126b12da..cc461552898fc68661ce548a520d65215d3572b4 100644
--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
@@ -61,6 +61,7 @@ std::vector<int> TestSizes() {
 }
 
 namespace jit = paddle::operators::jit;
+using CPUPlace = paddle::platform::CPUPlace;
 
 template <typename KernelTuples, typename... Args>
 struct TestFuncWithRefer {
@@ -121,6 +122,40 @@ struct TestFuncWithRefer<jit::AXYNTuples<T>, T, std::vector<T>,
   }
 };
 
+template <typename T>
+struct TestFuncWithRefer<jit::SoftmaxTuples<T>, std::vector<T>, std::vector<T>,
+                         int, int> {  // checker for softmax kernels: runs tgt and compares its output with yref
+  void operator()(const typename jit::SoftmaxTuples<T>::func_type tgt,
+                  const std::vector<T>& x, const std::vector<T>& yref, int n,
+                  int bs) {
+    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_EQ(yref.size(), x.size());
+    EXPECT_EQ(x.size(), static_cast<size_t>(n * bs));  // the input must hold exactly n * bs elements
+    const T* x_data = x.data();
+    const T* yref_data = yref.data();
+    std::vector<T> ytgt(n * bs);
+    T* ytgt_data = ytgt.data();
+    // out-of-place call: separate input and output buffers
+    tgt(x_data, ytgt_data, n, bs);
+    ExpectEQ<T>(ytgt_data, yref_data, n * bs);
+    // in-place call: the same buffer is passed as input and output
+    std::copy(x.begin(), x.end(), ytgt.begin());  // reload the input over the previous result
+    tgt(ytgt_data, ytgt_data, n, bs);
+    ExpectEQ<T>(ytgt_data, yref_data, n * bs);
+  }
+};
+
+template <typename T>
+struct TestFuncWithRefer<jit::XRNTuples<T>, std::vector<T>, T> {  // checker for kernels that reduce a vector to a single value
+  void operator()(const typename jit::XRNTuples<T>::func_type tgt,
+                  const std::vector<T>& x, const T ref_res) {
+    EXPECT_TRUE(tgt != nullptr);
+    T tgt_res;  // left uninitialized; tgt is expected to write it
+    tgt(x.data(), &tgt_res, x.size());
+    ExpectEQ<T>(&tgt_res, &ref_res, 1);  // compare the single result against the reference value
+  }
+};
+
 template <typename T>
 struct TestFuncWithRefer<jit::XYNTuples<T>, std::vector<T>, std::vector<T>> {
   void operator()(const typename jit::XYNTuples<T>::func_type tgt,
@@ -172,7 +207,7 @@ struct TestFuncWithRefer<jit::LSTMTuples<T>, std::vector<T>, std::vector<T>,
     T* ht_data = ht.data();
     T* checked_data = checked.data();
 
-    paddle::operators::jit::lstm_t step;
+    jit::lstm_t step;
     step.gates = x_data;
     step.ct_1 = ct_1_data;
     step.ct = ct_data;
@@ -208,7 +243,7 @@ struct TestFuncWithRefer<jit::GRUTuples<T>, std::vector<T>, std::vector<T>,
     const T* ht_ref_data = ht_ref.data();
     T* x_data = x.data();
     T* ht_data = ht.data();
-    paddle::operators::jit::gru_t step;
+    jit::gru_t step;
     step.gates = x_data;
     step.ht_1 = ht_1_data;
     step.ht = ht_data;
@@ -255,8 +290,8 @@ struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
   }
 };
 
-template <paddle::operators::jit::KernelType KT, typename KernelTuples,
-          typename PlaceType, typename... Args>
+template <jit::KernelType KT, typename KernelTuples, typename PlaceType,
+          typename... Args>
 void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
   TestFuncWithRefer<KernelTuples, Args...> test;
   // test jitcode
@@ -286,9 +321,8 @@ void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
   test(tgt, args...);
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void TestXYZNKernel() {
-  namespace jit = paddle::operators::jit;
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   for (int d : TestSizes()) {
     auto ref = jit::GetRefer<KT, jit::XYZNTuples<T>>();
@@ -320,9 +354,8 @@ void TestXYZNKernel() {
   }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void TestAXYNKernel() {
-  namespace jit = paddle::operators::jit;
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   for (int d : TestSizes()) {
     auto ref = jit::GetRefer<KT, jit::AXYNTuples<T>>();
@@ -347,9 +380,26 @@ void TestAXYNKernel() {
   }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
+void TestXRNKernel() {  // for every size in TestSizes(), computes a reference result and passes it to TestAllImpls
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  auto last_acc = acc;  // acc is declared outside this hunk (presumably the ExpectEQ tolerance); saved so it can be restored
+  acc = 1e-4;
+  for (int d : TestSizes()) {
+    auto ref = jit::GetRefer<KT, jit::XRNTuples<T>>();
+    EXPECT_TRUE(ref != nullptr);
+    std::vector<T> x(d);
+    RandomVec<T>(d, x.data(), -2.f, 2.f);  // presumably fills x with random values between -2 and 2
+    T ref_res;
+    ref(x.data(), &ref_res, d);  // reference result that each implementation is checked against
+    TestAllImpls<KT, jit::XRNTuples<T>, PlaceType, std::vector<T>, T>(d, x,  // the leading d is the kernel attr
+                                                                      ref_res);
+  }
+  acc = last_acc;  // put back the value saved on entry
+}
+
+template <jit::KernelType KT, typename T, typename PlaceType>
 void TestXYNKernel() {
-  namespace jit = paddle::operators::jit;
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   for (int d : TestSizes()) {
     auto ref = jit::GetRefer<KT, jit::XYNTuples<T>>();
@@ -373,9 +423,8 @@ void TestXYNKernel() {
   }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void TestLSTMKernel() {
-  namespace jit = paddle::operators::jit;
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
   for (int d : TestSizes()) {
@@ -424,9 +473,8 @@ void TestLSTMKernel() {
   }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void TestGRUKernel() {
-  namespace jit = paddle::operators::jit;
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
   for (int d : TestSizes()) {
@@ -459,7 +507,7 @@ void TestGRUKernel() {
   }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void TestSeqPoolKernel() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   std::vector<jit::SeqPoolType> pool_types = {
@@ -484,7 +532,7 @@ void TestSeqPoolKernel() {
   }
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
 void TestMatMulKernel() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   auto last_acc = acc;
@@ -510,7 +558,32 @@ void TestMatMulKernel() {
   acc = last_acc;
 }
 
-template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
+template <jit::KernelType KT, typename T, typename PlaceType>
+void TestSoftmaxKernel() {  // checks the reference softmax in place, then passes its output to TestAllImpls
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  for (int bs : {1, 2, 10}) {  // batch sizes (number of rows)
+    for (int n : TestSizes()) {  // row length
+      auto ref = jit::GetRefer<KT, jit::SoftmaxTuples<T>>();
+      EXPECT_TRUE(ref != nullptr);
+      std::vector<T> x(bs * n), y(bs * n);
+      RandomVec<T>(bs * n, x.data(), -2.f, 2.f);  // presumably fills x with random values between -2 and 2
+      const T* x_data = x.data();
+      T* y_data = y.data();
+
+      std::vector<T> xinp(x.size());  // copy of x used for the in-place reference run
+      std::copy(x.begin(), x.end(), xinp.begin());
+      ref(x_data, y_data, n, bs);  // out-of-place reference result in y
+      T* xinp_data = xinp.data();
+      ref(xinp_data, xinp_data, n, bs);  // in-place reference result in xinp
+      ExpectEQ<T>(xinp_data, y_data, n * bs);  // the in-place and out-of-place reference runs must agree
+
+      TestAllImpls<KT, jit::SoftmaxTuples<T>, PlaceType, std::vector<T>,
+                   std::vector<T>>(n, x, y, n, bs);  // the leading n is the kernel attr; y is the expected output
+    }
+  }
+}
+
+template <jit::KernelType KT, typename T, typename PlaceType>
 void TestNCHW16CMulNCKernel() {
   VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
   const int n = 3, c = 16 * 4, h = 10, w = 10;
@@ -565,129 +638,123 @@ void TestNCHW16CMulNCKernel() {
 
 // XYZNTuple
 TEST(JITKernel, kVMul) {
-  namespace jit = paddle::operators::jit;
-  TestXYZNKernel<jit::kVMul, float, paddle::platform::CPUPlace>();
-  TestXYZNKernel<jit::kVMul, double, paddle::platform::CPUPlace>();
+  TestXYZNKernel<jit::kVMul, float, CPUPlace>();
+  TestXYZNKernel<jit::kVMul, double, CPUPlace>();
 }
 
 TEST(JITKernel, kVAdd) {
-  namespace jit = paddle::operators::jit;
-  TestXYZNKernel<jit::kVAdd, float, paddle::platform::CPUPlace>();
-  TestXYZNKernel<jit::kVAdd, double, paddle::platform::CPUPlace>();
+  TestXYZNKernel<jit::kVAdd, float, CPUPlace>();
+  TestXYZNKernel<jit::kVAdd, double, CPUPlace>();
 }
 
 TEST(JITKernel, kVAddRelu) {
-  namespace jit = paddle::operators::jit;
-  TestXYZNKernel<jit::kVAddRelu, float, paddle::platform::CPUPlace>();
-  TestXYZNKernel<jit::kVAddRelu, double, paddle::platform::CPUPlace>();
+  TestXYZNKernel<jit::kVAddRelu, float, CPUPlace>();
+  TestXYZNKernel<jit::kVAddRelu, double, CPUPlace>();
 }
 
 TEST(JITKernel, kVSub) {
-  namespace jit = paddle::operators::jit;
-  TestXYZNKernel<jit::kVSub, float, paddle::platform::CPUPlace>();
-  TestXYZNKernel<jit::kVSub, double, paddle::platform::CPUPlace>();
+  TestXYZNKernel<jit::kVSub, float, CPUPlace>();
+  TestXYZNKernel<jit::kVSub, double, CPUPlace>();
 }
 
 // AXYNTuples
 TEST(JITKernel, kVScal) {
-  namespace jit = paddle::operators::jit;
-  TestAXYNKernel<jit::kVScal, float, paddle::platform::CPUPlace>();
-  TestAXYNKernel<jit::kVScal, double, paddle::platform::CPUPlace>();
+  TestAXYNKernel<jit::kVScal, float, CPUPlace>();
+  TestAXYNKernel<jit::kVScal, double, CPUPlace>();
 }
 
 TEST(JITKernel, kVAddBias) {
-  namespace jit = paddle::operators::jit;
-  TestAXYNKernel<jit::kVAddBias, float, paddle::platform::CPUPlace>();
-  TestAXYNKernel<jit::kVAddBias, double, paddle::platform::CPUPlace>();
+  TestAXYNKernel<jit::kVAddBias, float, CPUPlace>();
+  TestAXYNKernel<jit::kVAddBias, double, CPUPlace>();
+}
+
+// XRNTuples
+TEST(JITKernel, kHMax) {  // runs TestXRNKernel for kHMax with float and double on CPUPlace
+  TestXRNKernel<jit::kHMax, float, CPUPlace>();
+  TestXRNKernel<jit::kHMax, double, CPUPlace>();
+}
+
+TEST(JITKernel, kHSum) {  // runs TestXRNKernel for kHSum with float and double on CPUPlace
+  TestXRNKernel<jit::kHSum, float, CPUPlace>();
+  TestXRNKernel<jit::kHSum, double, CPUPlace>();
 }
 
 // XYNTuples
 TEST(JITKernel, kVRelu) {
-  namespace jit = paddle::operators::jit;
-  TestXYNKernel<jit::kVRelu, float, paddle::platform::CPUPlace>();
-  TestXYNKernel<jit::kVRelu, double, paddle::platform::CPUPlace>();
+  TestXYNKernel<jit::kVRelu, float, CPUPlace>();
+  TestXYNKernel<jit::kVRelu, double, CPUPlace>();
 }
 
 TEST(JITKernel, kVIdentity) {
-  namespace jit = paddle::operators::jit;
-  TestXYNKernel<jit::kVIdentity, float, paddle::platform::CPUPlace>();
-  TestXYNKernel<jit::kVIdentity, double, paddle::platform::CPUPlace>();
+  TestXYNKernel<jit::kVIdentity, float, CPUPlace>();
+  TestXYNKernel<jit::kVIdentity, double, CPUPlace>();
 }
 
 TEST(JITKernel, kVSquare) {
-  namespace jit = paddle::operators::jit;
-  TestXYNKernel<jit::kVSquare, float, paddle::platform::CPUPlace>();
-  TestXYNKernel<jit::kVSquare, double, paddle::platform::CPUPlace>();
+  TestXYNKernel<jit::kVSquare, float, CPUPlace>();
+  TestXYNKernel<jit::kVSquare, double, CPUPlace>();
 }
 
 TEST(JITKernel, kVExp) {
-  namespace jit = paddle::operators::jit;
-  TestXYNKernel<jit::kVExp, float, paddle::platform::CPUPlace>();
-  TestXYNKernel<jit::kVExp, double, paddle::platform::CPUPlace>();
+  TestXYNKernel<jit::kVExp, float, CPUPlace>();
+  TestXYNKernel<jit::kVExp, double, CPUPlace>();
 }
 
 TEST(JITKernel, kVSigmoid) {
-  namespace jit = paddle::operators::jit;
-  TestXYNKernel<jit::kVSigmoid, float, paddle::platform::CPUPlace>();
-  TestXYNKernel<jit::kVSigmoid, double, paddle::platform::CPUPlace>();
+  TestXYNKernel<jit::kVSigmoid, float, CPUPlace>();
+  TestXYNKernel<jit::kVSigmoid, double, CPUPlace>();
 }
 
 TEST(JITKernel, kVTanh) {
-  namespace jit = paddle::operators::jit;
-  TestXYNKernel<jit::kVTanh, float, paddle::platform::CPUPlace>();
-  TestXYNKernel<jit::kVTanh, double, paddle::platform::CPUPlace>();
+  TestXYNKernel<jit::kVTanh, float, CPUPlace>();
+  TestXYNKernel<jit::kVTanh, double, CPUPlace>();
 }
 
 // LSTM
 TEST(JITKernel, kLSTMCtHt) {
-  namespace jit = paddle::operators::jit;
-  TestLSTMKernel<jit::kLSTMCtHt, float, paddle::platform::CPUPlace>();
-  TestLSTMKernel<jit::kLSTMCtHt, double, paddle::platform::CPUPlace>();
+  TestLSTMKernel<jit::kLSTMCtHt, float, CPUPlace>();
+  TestLSTMKernel<jit::kLSTMCtHt, double, CPUPlace>();
 }
 
 TEST(JITKernel, kLSTMC1H1) {
-  namespace jit = paddle::operators::jit;
-  TestLSTMKernel<jit::kLSTMC1H1, float, paddle::platform::CPUPlace>();
-  TestLSTMKernel<jit::kLSTMC1H1, double, paddle::platform::CPUPlace>();
+  TestLSTMKernel<jit::kLSTMC1H1, float, CPUPlace>();
+  TestLSTMKernel<jit::kLSTMC1H1, double, CPUPlace>();
 }
 
 // GRU
 TEST(JITKernel, kGRUH1) {
-  namespace jit = paddle::operators::jit;
-  TestGRUKernel<jit::kGRUH1, float, paddle::platform::CPUPlace>();
-  TestGRUKernel<jit::kGRUH1, double, paddle::platform::CPUPlace>();
+  TestGRUKernel<jit::kGRUH1, float, CPUPlace>();
+  TestGRUKernel<jit::kGRUH1, double, CPUPlace>();
 }
 
 TEST(JITKernel, kGRUHtPart1) {
-  namespace jit = paddle::operators::jit;
-  TestGRUKernel<jit::kGRUHtPart1, float, paddle::platform::CPUPlace>();
-  TestGRUKernel<jit::kGRUHtPart1, double, paddle::platform::CPUPlace>();
+  TestGRUKernel<jit::kGRUHtPart1, float, CPUPlace>();
+  TestGRUKernel<jit::kGRUHtPart1, double, CPUPlace>();
 }
 
 TEST(JITKernel, kGRUHtPart2) {
-  namespace jit = paddle::operators::jit;
-  TestGRUKernel<jit::kGRUHtPart2, float, paddle::platform::CPUPlace>();
-  TestGRUKernel<jit::kGRUHtPart2, double, paddle::platform::CPUPlace>();
+  TestGRUKernel<jit::kGRUHtPart2, float, CPUPlace>();
+  TestGRUKernel<jit::kGRUHtPart2, double, CPUPlace>();
 }
 
 TEST(JITKernel, kSeqPool) {
-  namespace jit = paddle::operators::jit;
-  TestSeqPoolKernel<jit::kSeqPool, float, paddle::platform::CPUPlace>();
-  TestSeqPoolKernel<jit::kSeqPool, double, paddle::platform::CPUPlace>();
+  TestSeqPoolKernel<jit::kSeqPool, float, CPUPlace>();
+  TestSeqPoolKernel<jit::kSeqPool, double, CPUPlace>();
 }
 
 TEST(JITKernel, kMatMul) {
-  namespace jit = paddle::operators::jit;
-  TestMatMulKernel<jit::kMatMul, float, paddle::platform::CPUPlace>();
-  TestMatMulKernel<jit::kMatMul, double, paddle::platform::CPUPlace>();
+  TestMatMulKernel<jit::kMatMul, float, CPUPlace>();
+  TestMatMulKernel<jit::kMatMul, double, CPUPlace>();
+}
+
+TEST(JITKernel, kSoftmax) {  // runs TestSoftmaxKernel for kSoftmax with float and double on CPUPlace
+  TestSoftmaxKernel<jit::kSoftmax, float, CPUPlace>();
+  TestSoftmaxKernel<jit::kSoftmax, double, CPUPlace>();
 }
 
 TEST(JITKernel, kNCHW16CMulNC) {
-  namespace jit = paddle::operators::jit;
-  TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, float,
-                         paddle::platform::CPUPlace>();
-  TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, double,
-                         paddle::platform::CPUPlace>();
+  TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, float, CPUPlace>();
+  TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, double, CPUPlace>();
 }
 
 // TODO(yihua/TJ): add crf decoding and layer norm unit tests
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 6bbb7155dda9b2c844f793a63adb861c2ed956e8..e20524012a5839fd250b7426a5efc42b7e87fe87 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -53,7 +53,7 @@ math_library(sequence2batch)
 math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function jit_kernel_helper)
 math_library(sequence_scale)
-math_library(softmax DEPS math_function)
+math_library(softmax DEPS math_function jit_kernel_helper)
 math_library(beam_search DEPS math_function)
 
 math_library(matrix_bit_code)
diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h
index 1d9d98b10646af9e199f6c481740d30745888707..1ff9ff684fc8001afb0f768a033b4c5bd1592702 100644
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -16,8 +16,8 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/jit/kernels.h"
 
-#include "paddle/fluid/operators/math/blas.h"
 namespace paddle {
 namespace operators {
 namespace math {
@@ -81,28 +81,10 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
     const int kBatchDim = 0;
     const int kClassDim = 1;
     // 2D data. Batch x C
-    const int batch_size = in_dims[kBatchDim];
-    const int num_classes = in_dims[kClassDim];
-    std::vector<float> entities(batch_size);
-    auto blas = math::GetBlas<DeviceContext, float>(context);
-    for (int n = 0; n < batch_size; ++n) {
-      entities[n] = in_data[n * num_classes];
-      for (int c = 1; c < num_classes; ++c) {
-        entities[n] = in_data[n * num_classes + c] > entities[n]
-                          ? in_data[n * num_classes + c]
-                          : entities[n];
-      }
-      for (int c = 0; c < num_classes; ++c) {
-        out_data[n * num_classes + c] =
-            in_data[n * num_classes + c] - entities[n];
-      }
-    }
-
-    blas.VEXP(num_classes * batch_size, out_data, out_data);
-    for (int n = 0; n < batch_size; ++n) {
-      auto sum = blas.ASUM(num_classes, &out_data[n * num_classes], 1);
-      blas.SCAL(num_classes, 1.0f / sum, &out_data[n * num_classes]);
-    }
+    auto compute_softmax =
+        jit::Get<jit::kSoftmax, jit::SoftmaxTuples<float>, platform::CPUPlace>(
+            in_dims[kClassDim]);
+    compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]);
   }
 };
 
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
index d0619293acf2d2df0d925e969bdeb8e45cda6e2b..a260cda49138580b209e647af459e9392d9f18f1 100644
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -70,6 +70,8 @@ extern void* mklml_dso_handle;
   __macro(cblas_ddot);              \
   __macro(cblas_sasum);             \
   __macro(cblas_dasum);             \
+  __macro(cblas_isamax);            \
+  __macro(cblas_idamax);            \
   __macro(cblas_sscal);             \
   __macro(cblas_dscal);             \
   __macro(vsAdd);                   \