diff --git a/paddle/fluid/operators/fusion_gru_op.cc b/paddle/fluid/operators/fusion_gru_op.cc
index 3a34aa86b6331e4fe2813eea97cb6644323807c3..fcd551ed3b00cc7c3fbbc2c6e7f9fd071d3228a4 100644
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
@@ -13,16 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/fusion_gru_op.h"
+#include <cstring>  // for memcpy
 #include <string>
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/detail/activation_functions.h"
-#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
-#include "paddle/fluid/operators/math/detail/gru_kernel.h"
+#include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
-#include "paddle/fluid/operators/math/gru_compute.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
+#include "paddle/fluid/platform/cpu_info.h"
 
 namespace paddle {
 namespace operators {
@@ -35,12 +32,12 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
                  "Input(WeightH) of GRU should not be null.");
 
   PADDLE_ENFORCE(ctx->HasOutput("XX"), "Output(XX) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"),
-                 "Output(BatchedGate) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"),
-                 "Output(BatchResetHiddenPrev) of GRU should not be null.");
-  PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
-                 "Output(BatchedHidden) of GRU should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
+                 "Output(ReorderedH0) of GRU should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
+                 "Output(BatchedInput) of GRU should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("BatchedOut"),
+                 "Output(BatchedOut) of GRU should not be null.");
   PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
                  "Output(Hidden) of GRU should not be null.");
 
@@ -83,9 +80,8 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
   }
   framework::DDim out_dims({x_dims[0], frame_size});
   ctx->SetOutputDim("Hidden", out_dims);
-  ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]});
-  ctx->SetOutputDim("BatchedHidden", out_dims);
-  ctx->SetOutputDim("BatchResetHiddenPrev", out_dims);
+  ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]});
+  ctx->SetOutputDim("BatchedOut", out_dims);
   ctx->ShareLoD("X", "Hidden");
 
   int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
@@ -115,22 +111,26 @@ void FusionGRUOpMaker::Make() {
            "(Tensor) The FC weight with shape (M x 3D),"
            "where M is the dim size of x, D is the hidden size. ");
   AddInput("WeightH",
-           "(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. ");
+           "(Tensor) (D x 3D) Same as GRUOp, where D is the hidden size. "
+           "This weight is not exactly D x 3D as: {W_update, W_reset, W_state}"
+           "Acutally they are D x 2D and D x D two part weights."
+           "{W_update, W_reset; W_state}"
+           "{D x (D + D); D x D}");
   AddInput("Bias",
            "(Tensor, optional) (1 x 3D)."
            "Almost same as GRUOp."
            "Note: if have FC bias it should be added on this bias.")
       .AsDispensable();
+  AddOutput("ReorderedH0", "(Tensor) (N x D), which N is the min-batch size.")
+      .AsIntermediate();
   AddOutput("XX",
-            "(LoDTensor) the result after X * WeightX (size is T x 4D)"
+            "(LoDTensor) the result after X * WeightX (size is T x 3D)"
             " or batched_X (size is T x M), this will be automatically chosen,"
             " where T is the total time steps in this mini-batch,"
             " D is the hidden size, M is the dim size of x input.")
       .AsIntermediate();
-  AddOutput("BatchedGate", "(LoDTensor) Same as GRUOp").AsIntermediate();
-  AddOutput("BatchResetHiddenPrev", "(LoDTensor) (T x 3D) Same as GRUOp.")
-      .AsIntermediate();
-  AddOutput("BatchedHidden", "(LoDTensor) (T X D) Same as GRUOp.")
+  AddOutput("BatchedInput", "(LoDTensor) (T x 3D)").AsIntermediate();
+  AddOutput("BatchedOut", "(LoDTensor) (T X D) save batched hidden.")
       .AsIntermediate();
   AddOutput("Hidden", "(LoDTensor) (T x D) Same as GRUOp");
   AddAttr<std::string>("activation",
@@ -153,45 +153,53 @@ more details can refer to GRU op.
 )DOC");
 }
 
-template <typename DeviceContext, typename T>
-inline void ReorderInitState(const DeviceContext& ctx,
-                             const framework::Tensor& src,
-                             framework::Vector<size_t> index_lod,
-                             framework::Tensor* dst, bool indexed_src) {
-  math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
-  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
-  row_shuffle(ctx, src, index_lod, dst, indexed_src);
-}
-
-template <typename DeviceContext, typename T>
+template <typename T>
 class FusionGRUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    using DeviceContext = paddle::platform::CPUDeviceContext;
     auto* x = ctx.Input<LoDTensor>("X");
     auto* wx = ctx.Input<Tensor>("WeightX");
     auto* wh = ctx.Input<Tensor>("WeightH");
     auto* bias = ctx.Input<Tensor>("Bias");
     auto* h0 = ctx.Input<Tensor>("H0");
 
+    auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
     auto* xx = ctx.Output<LoDTensor>("XX");
-    auto* batched_gate = ctx.Output<LoDTensor>("BatchedGate");
-    auto* batch_reset_hidden_prev =
-        ctx.Output<LoDTensor>("BatchResetHiddenPrev");
-    auto* batch_hidden = ctx.Output<LoDTensor>("BatchedHidden");
+    auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
+    auto* batched_out = ctx.Output<LoDTensor>("BatchedOut");
     auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
-    bool is_reverse = ctx.Attr<bool>("is_reverse");
 
-    T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
-    T* batched_gate_data = batched_gate->mutable_data<T>(ctx.GetPlace());
-    batch_reset_hidden_prev->mutable_data<T>(ctx.GetPlace());
-    batch_hidden->mutable_data<T>(ctx.GetPlace());
-    hidden_out->mutable_data<T>(ctx.GetPlace());
+    bool is_reverse = ctx.Attr<bool>("is_reverse");
+    std::function<void(const int, const T *, T *)> act_gate, act_state;
+    std::function<void(const int, const T, const T*, T*)> bias_sub;
+    auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
+    auto& act_state_str = ctx.Attr<std::string>("activation");
+    if (platform::jit::MayIUse(platform::jit::avx)) {
+      math::VecActivations<T, platform::jit::avx> act_functor;
+      act_gate = act_functor(act_gate_str);
+      act_state = act_functor(act_state_str);
+      bias_sub = math::vec_bias_sub<T, platform::jit::avx>;
+    } else {
+      math::VecActivations<T, platform::jit::isa_any> act_functor;
+      act_gate = act_functor(act_gate_str);
+      act_state = act_functor(act_state_str);
+      bias_sub = math::vec_bias_sub<T, platform::jit::isa_any>;
+    }
 
     const T* x_data = x->data<T>();
     const T* wx_data = wx->data<T>();
     const T* wh_data = wh->data<T>();
+    T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
+    T* batched_input_data = batched_input->mutable_data<T>(ctx.GetPlace());
+    T* batched_out_data = batched_out->mutable_data<T>(ctx.GetPlace());
+    hidden_out->mutable_data<T>(ctx.GetPlace());
+
     auto x_dims = x->dims();
     auto wx_dims = wx->dims();
+    const int D3 = wx_dims[1];
+    const int D = D3 / 3;
+    const int D2 = D * 2;
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
     math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
@@ -199,125 +207,110 @@ class FusionGRUKernel : public framework::OpKernel<T> {
       math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
                                         x_data, wx_data, xx_data,
                                         bias ? bias->data<T>() : NULL);
-      to_batch(dev_ctx, *xx, batched_gate, true, is_reverse);
+      to_batch(dev_ctx, *xx, batched_input, true, is_reverse);
     } else {
       to_batch(dev_ctx, *x, xx, true, is_reverse);
-      batched_gate->set_lod(xx->lod());
+      batched_input->set_lod(xx->lod());
       math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
-                                        xx_data, wx_data, batched_gate_data,
+                                        xx_data, wx_data, batched_input_data,
                                         bias ? bias->data<T>() : NULL);
     }
 
-    int frame_size = static_cast<int>(wx_dims[1] / 3);
-    math::GRUMetaValue<T> gru_value;
-    gru_value.gate_weight = const_cast<T*>(wh_data);
-    gru_value.state_weight =
-        const_cast<T*>(wh_data + 2 * frame_size * frame_size);
-    Tensor ordered_h0;
-
-    framework::Vector<size_t> order(batched_gate->lod()[2]);
+    auto batched_lod = batched_input->lod();
+    const auto& seq_order = batched_lod[2];
+    const int max_bs = seq_order.size();
+    reordered_h0->Resize({max_bs, D});
 
+    int tstart = 0;
+    T* prev_hidden_data = NULL;
     if (h0) {
-      ReorderInitState<DeviceContext, T>(
-          ctx.template device_context<DeviceContext>(), *h0, order, &ordered_h0,
-          true);
-      gru_value.prev_out_value = ordered_h0.data<T>();
+      // reorder h0
+      T* reordered_h0_data = reordered_h0->mutable_data<T>(ctx.GetPlace());
+      const T* h0_data = h0->data<T>();
+      prev_hidden_data = reordered_h0_data;
+      size_t sz = sizeof(T) * D;
+      for (int i = 0; i < max_bs; ++i) {
+        std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz);
+        reordered_h0_data += D;
+      }
     } else {
-      gru_value.prev_out_value = nullptr;
+      // compute without h0
+      T* cur_in_data = batched_input_data;
+      T* cur_out_data = batched_out_data;
+      // W: {W_update, W_reset; W_state}
+      for (int i = 0; i < max_bs; ++i) {
+        // update gate
+        act_gate(D, cur_in_data, cur_in_data);
+        // state gate
+        act_state(D, cur_in_data + D2, cur_in_data + D2);
+        // out = a*b
+        blas.VMUL(D, cur_in_data, cur_in_data + D2, cur_out_data);
+        // add offset
+        cur_in_data += D3;
+        cur_out_data += D;
+      }
+      tstart = 1;
+      prev_hidden_data = batched_out_data;
     }
-    auto batch_starts = batched_gate->lod()[0];
-    size_t seq_len = batch_starts.size() - 1;
-    auto active_node =
-        math::detail::GetActivationType(ctx.Attr<std::string>("activation"));
-    auto active_gate = math::detail::GetActivationType(
-        ctx.Attr<std::string>("gate_activation"));
-
-#ifdef PADDLE_WITH_MKLML
-    // use MKL packed to speedup GEMM
-    if (FLAGS_paddle_num_threads >= 4) {
-      auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
-      T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
-                                       frame_size * 2 /*width of weight*/,
-                                       frame_size /*height of height*/);
-      PADDLE_ENFORCE(packed_gate);
-      blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size * 2,
-                     frame_size, T(1.0), gru_value.gate_weight, frame_size * 2,
-                     packed_gate);
-      T* packed_state = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
-                                        frame_size /*width of weight*/,
-                                        frame_size /*height of height*/);
-      PADDLE_ENFORCE(packed_state);
-      blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size,
-                     frame_size, T(1.0), gru_value.state_weight, frame_size,
-                     packed_state);
-      for (size_t n = 0; n < seq_len; n++) {
-        int bstart = static_cast<int>(batch_starts[n]);
-        int bend = static_cast<int>(batch_starts[n + 1]);
-        int cur_batch_size = bend - bstart;
-
-        Tensor gate_t = batched_gate->Slice(bstart, bend);
-        Tensor reset_hidden_prev_t =
-            batch_reset_hidden_prev->Slice(bstart, bend);
-        Tensor hidden_t = batch_hidden->Slice(bstart, bend);
-        gru_value.output_value = hidden_t.data<T>();
-        gru_value.gate_value = gate_t.data<T>();
-        gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
-
-        if (gru_value.prev_out_value) {
-          blas.GEMM_COMPUTE(
-              CblasNoTrans, CblasPacked, cur_batch_size, frame_size * 2,
-              frame_size, gru_value.prev_out_value, frame_size, packed_gate,
-              frame_size * 2, T(1), gru_value.gate_value, frame_size * 3);
-        }
-
-        math::detail::forward_reset_output(
-            math::detail::forward::gru_resetOutput<T>(), gru_value, frame_size,
-            cur_batch_size, active_gate);
-
-        if (gru_value.prev_out_value) {
-          blas.GEMM_COMPUTE(
-              CblasNoTrans, CblasPacked, cur_batch_size, frame_size, frame_size,
-              gru_value.reset_output_value, frame_size, packed_state,
-              frame_size, T(1), gru_value.gate_value + frame_size * 2,
-              frame_size * 3);
-        }
-
-        math::detail::forward_final_output(
-            math::detail::forward::gru_finalOutput<T>(), gru_value, frame_size,
-            cur_batch_size, active_node);
-
-        gru_value.prev_out_value = gru_value.output_value;
+    // Then start from next
+    const T* wh_state_data = wh_data + D * D2;
+    const auto& batch_starts = batched_lod[0];
+    const int max_seq_len = batch_starts.size() - 1;
+    batched_input_data = batched_input_data + tstart * max_bs * D3;
+    batched_out_data = batched_out_data + tstart * max_bs * D;
+    for (int step = tstart; step < max_seq_len; ++step) {
+      const int cur_bs = batch_starts[step + 1] - batch_starts[step];
+      // gemm prev * (Wu + Wr)
+      blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D2, D, static_cast<T>(1),
+                prev_hidden_data, D, wh_data, D2, static_cast<T>(1),
+                batched_input_data, D3);
+
+      T* cur_batched_data = batched_input_data;
+      T* cur_prev_hidden_data = prev_hidden_data;
+      for (int i = 0; i < cur_bs; ++i) {
+        act_gate(D2, cur_batched_data, cur_batched_data);
+        // rt = rt*ht_1 inplace result
+        // TODO(TJ): try to save to cur out data
+        // maybe get benifits avoiding cache miss in next gemm
+        blas.VMUL(D, cur_prev_hidden_data, cur_batched_data + D,
+                  cur_batched_data + D);
+
+        cur_batched_data += D3;
+        cur_prev_hidden_data += D;
       }
 
-      blas.GEMM_FREE(packed_gate);
-      blas.GEMM_FREE(packed_state);
-    } else {
-#endif
-      for (size_t n = 0; n < seq_len; n++) {
-        int bstart = static_cast<int>(batch_starts[n]);
-        int bend = static_cast<int>(batch_starts[n + 1]);
-        int cur_batch_size = bend - bstart;
-
-        Tensor gate_t = batched_gate->Slice(bstart, bend);
-        Tensor reset_hidden_prev_t =
-            batch_reset_hidden_prev->Slice(bstart, bend);
-        Tensor hidden_t = batch_hidden->Slice(bstart, bend);
-        gru_value.output_value = hidden_t.data<T>();
-        gru_value.gate_value = gate_t.data<T>();
-        gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
-
-        math::GRUUnitFunctor<DeviceContext, T>::compute(
-            dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
-            active_gate);
-
-        gru_value.prev_out_value = gru_value.output_value;
+      cur_batched_data = batched_input_data;
+      blas.GEMM(CblasNoTrans, CblasNoTrans, cur_bs, D, D, static_cast<T>(1),
+                cur_batched_data + D, D3, wh_state_data, D, static_cast<T>(1),
+                cur_batched_data + D2, D3);
+
+      T* cur_out_data = batched_out_data;
+      cur_prev_hidden_data = prev_hidden_data;
+      for (int i = 0; i < cur_bs; ++i) {
+        // ht~ = act_state(...)
+        act_state(D, cur_batched_data + D2, cur_batched_data + D2);
+        // ht~~ = zt*ht~ inplace result
+        blas.VMUL(D, cur_batched_data, cur_batched_data + D2,
+                  cur_batched_data + D2);
+        // zt = 1 - zt inplace result
+        bias_sub(D, static_cast<T>(1), cur_batched_data, cur_batched_data);
+        // zt = ht_1 * zt
+        blas.VMUL(D, cur_prev_hidden_data, cur_batched_data, cur_batched_data);
+        // out = zt + ht~~
+        blas.VADD(D, cur_batched_data, cur_batched_data + D2, cur_out_data);
+
+        cur_batched_data += D3;
+        cur_prev_hidden_data += D;
+        cur_out_data += D;
       }
-#ifdef PADDLE_WITH_MKLML
+      prev_hidden_data = batched_out_data;
+      batched_out_data = cur_out_data;
+      batched_input_data = cur_batched_data;
     }
-#endif
+
     math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    batch_hidden->set_lod(batched_gate->lod());
-    to_seq(dev_ctx, *batch_hidden, hidden_out);
+    batched_out->set_lod(batched_lod);
+    to_seq(dev_ctx, *batched_out, hidden_out);
   }
 };
 
@@ -327,6 +320,5 @@ class FusionGRUKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(fusion_gru, ops::FusionGRUOp, ops::FusionGRUOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OP_CPU_KERNEL(
-    fusion_gru, ops::FusionGRUKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::FusionGRUKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(fusion_gru, ops::FusionGRUKernel<float>,
+                       ops::FusionGRUKernel<double>);
diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h
index 07372235a7c23832e528c3e852a4747f4244b833..a3186f82d0c0cc6c9585735ddf7e9bb4db7126cb 100644
--- a/paddle/fluid/operators/math/sequence2batch.h
+++ b/paddle/fluid/operators/math/sequence2batch.h
@@ -92,7 +92,7 @@ class LoDTensor2BatchFunctor {
     // Calculate the start position of each batch.
     // example:  sequences = {s0, s1, s2}
     //           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
-    //           num_batch = 5,
+    //           max_seqlen = 5,
     //           batchIndex = {b0, b1, b2, b3, b4}
     //           b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
     //           batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
@@ -109,7 +109,7 @@ class LoDTensor2BatchFunctor {
     //               where 1 is the second sequence,
     //                     0 is the first sequence,
     //                     2 is the third sequence.
-    // The num_batch represents batch size after rearranging the
+    // The max_seqlen represents batch size after rearranging the
     // input LodTensor. It is also the maximum length of input sequence.
 
     paddle::framework::LoD batch_lods;
@@ -118,8 +118,8 @@ class LoDTensor2BatchFunctor {
     batch_lods.emplace_back(std::vector<size_t>{0});
 
     // batch_lods[0] is the start positions for batch LoDTensor
-    int num_batch = seq_info[0].length;
-    batch_lods[0].resize(static_cast<size_t>(num_batch + 1));
+    int max_seqlen = seq_info[0].length;
+    batch_lods[0].resize(static_cast<size_t>(max_seqlen + 1));
     // batch_lods[1] is the raw index in the input LoDTensor
     batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
     // batch_lods[2] is the sort order for the input LoDTensor.
@@ -128,7 +128,7 @@ class LoDTensor2BatchFunctor {
     size_t* batch_starts = batch_lods[0].data();
     size_t* seq2batch_idx = batch_lods[1].data();
     batch_starts[0] = 0;
-    for (int n = 0; n < num_batch; n++) {
+    for (int n = 0; n < max_seqlen; n++) {
       auto batch_id = static_cast<int>(batch_starts[n]);
       for (size_t i = 0; i < seq_info.size(); ++i) {
         int seq_len = seq_info[i].length;