add x86 math lstm and selected_rows test=develop (#1991)

add math function: lstm and selected_rows into lite/x86/math add selected_rows and rw_lock into lite/fluid add lstm_cpu_kernel and lstm_kernel into lite/x86/detail

add x86 math lstm and selected_rows test=develop (#1991)
add math function: lstm and selected_rows into lite/x86/math add selected_rows and rw_lock into lite/fluid add lstm_cpu_kernel and lstm_kernel into lite/x86/detail
dc27e9ff · huzhiqiang · GitHub · 9851c008 · dc27e9ff · dc27e9ff
12 changed file
--- a/lite/backends/x86/math/CMakeLists.txt
+++ b/lite/backends/x86/math/CMakeLists.txt
@@ -30,13 +30,13 @@ math_library(sample_prob)
 math_library(sampler)
 math_library(gru_compute DEPS activation_functions math_function)
-## math_library(lstm_compute DEPS activation_functions)
+math_library(lstm_compute DEPS activation_functions)
 lite_cc_library(blas SRCS blas.cc DEPS cblas framework_proto eigen3)
 math_library(math_function DEPS blas)
 math_library(maxouting)
 math_library(pooling)
-# math_library(selected_rows_functor DEPS selected_rows math_function blas)
+math_library(selected_rows_functor DEPS selected_rows math_function blas)
 math_library(sequence2batch)
 math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function jit_kernel_helper)

--- a/lite/backends/x86/math/detail/lstm_cpu_kernel.h
+++ b/lite/backends/x86/math/detail/lstm_cpu_kernel.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <type_traits>
+#include "lite/backends/x86/math/detail/activation_functions.h"
+#include "lite/backends/x86/math/lstm_compute.h"
+#if defined(_WIN32)
+#if defined(__AVX2__) || defined(__AVX__)
+inline __m256 operator+=(__m256 a, __m256 b) { return _mm256_add_ps(a, b); }
+#endif
+#endif
+namespace paddle {
+namespace lite {
+namespace x86 {
+namespace math {
+namespace detail {
+#ifndef __NVCC__
+template <class T, class Op>
+void naive_lstm_forward_one_sequence(Op op,
+                                     LstmMetaValue<T> value,
+                                     int frame_size,
+                                     T cell_clip,
+                                     ActivationType active_node,
+                                     ActivationType active_gate,
+                                     ActivationType active_state) {
+  T r_value_in;
+  T r_value_ig;
+  T r_value_fg;
+  T r_value_og;
+  T r_checkI;
+  T r_checkF;
+  T r_checkO;
+  T r_state;
+  T r_prev_state = 0;
+  T r_state_atv;
+  T r_out;
+  T *value_in = value.gate_value;
+  T *value_ig = value.gate_value + frame_size;
+  T *value_fg = value.gate_value + frame_size * 2;
+  T *value_og = value.gate_value + frame_size * 3;
+  for (int i = 0; i < frame_size; i++) {
+    r_value_in = value_in[i];
+    r_value_ig = value_ig[i];
+    r_value_fg = value_fg[i];
+    r_value_og = value_og[i];
+    r_checkI = value.check_ig ? value.check_ig[i] : 0;
+    r_checkF = value.check_fg ? value.check_fg[i] : 0;
+    r_checkO = value.check_og ? value.check_og[i] : 0;
+    if (value.prev_state_value) {
+      r_prev_state = value.prev_state_value[i];
+    }
+    op(&r_value_in,
+       &r_value_ig,
+       &r_value_fg,
+       &r_value_og,
+       &r_prev_state,
+       &r_state,
+       &r_state_atv,
+       &r_out,
+       &r_checkI,
+       &r_checkF,
+       &r_checkO,
+       &cell_clip,
+       active_node,
+       active_gate,
+       active_state);
+    value_in[i] = r_value_in;
+    value_ig[i] = r_value_ig;
+    value_fg[i] = r_value_fg;
+    value_og[i] = r_value_og;
+    value.state_value[i] = r_state;
+    value.state_active_value[i] = r_state_atv;
+    value.output_value[i] = r_out;
+  }
+}
+template <class T, class Op>
+void naive_lstm_backward_one_sequence(Op op,
+                                      LstmMetaValue<T> value,
+                                      LstmMetaGrad<T> grad,
+                                      int frame_size,
+                                      T cell_clip,
+                                      ActivationType active_node,
+                                      ActivationType active_gate,
+                                      ActivationType active_state) {
+  T r_value_in;
+  T r_value_ig;
+  T r_value_fg;
+  T r_value_og;
+  T r_grad_in;
+  T r_grad_ig;
+  T r_grad_fg;
+  T r_grad_og;
+  T r_prev_state = 0;
+  T r_prev_state_grad;
+  T r_state;
+  T r_state_grad;
+  T r_state_atv;
+  T r_output_grad;
+  T r_checkI;
+  T r_checkF;
+  T r_checkO;
+  T r_checkIGrad;
+  T r_checkFGrad;
+  T r_checkOGrad;
+  T *value_in = value.gate_value;
+  T *value_ig = value.gate_value + frame_size;
+  T *value_fg = value.gate_value + frame_size * 2;
+  T *value_og = value.gate_value + frame_size * 3;
+  T *grad_in = grad.gate_grad;
+  T *grad_ig = grad.gate_grad + frame_size;
+  T *grad_fg = grad.gate_grad + frame_size * 2;
+  T *grad_og = grad.gate_grad + frame_size * 3;
+  for (int i = 0; i < frame_size; i++) {
+    r_value_in = value_in[i];
+    r_value_ig = value_ig[i];
+    r_value_fg = value_fg[i];
+    r_value_og = value_og[i];
+    r_checkI = value.check_ig ? value.check_ig[i] : 0;
+    r_checkF = value.check_fg ? value.check_fg[i] : 0;
+    r_checkO = value.check_og ? value.check_og[i] : 0;
+    r_state = value.state_value[i];
+    r_state_atv = value.state_active_value[i];
+    r_output_grad = grad.output_grad[i];
+    r_state_grad = grad.state_grad[i];
+    if (value.prev_state_value) {
+      r_prev_state = value.prev_state_value[i];
+    }
+    op(&r_value_in,
+       &r_value_ig,
+       &r_value_fg,
+       &r_value_og,
+       &r_grad_in,
+       &r_grad_ig,
+       &r_grad_fg,
+       &r_grad_og,
+       &r_prev_state,
+       &r_prev_state_grad,
+       &r_state,
+       &r_state_grad,
+       &r_state_atv,
+       &r_output_grad,
+       &r_checkI,
+       &r_checkF,
+       &r_checkO,
+       &r_checkIGrad,
+       &r_checkFGrad,
+       &r_checkOGrad,
+       &cell_clip,
+       active_node,
+       active_gate,
+       active_state);
+    grad_in[i] = r_grad_in;
+    grad_ig[i] = r_grad_ig;
+    grad_fg[i] = r_grad_fg;
+    grad_og[i] = r_grad_og;
+    grad.state_grad[i] = r_state_grad;
+    if (grad.prev_state_grad) grad.prev_state_grad[i] = r_prev_state_grad;
+    if (value.prev_state_value) {
+      if (grad.check_ig_grad) grad.check_ig_grad[i] += r_checkIGrad;
+      if (grad.check_fg_grad) grad.check_fg_grad[i] += r_checkFGrad;
+    }
+    if (grad.check_og_grad) grad.check_og_grad[i] += r_checkOGrad;
+  }
+}
+template <class T, class Op>
+void avx_lstm_forward_one_sequence(Op op,
+                                   LstmMetaValue<T> value,
+                                   int frame_size,
+                                   T cell_clip,
+                                   ActivationType active_node,
+                                   ActivationType active_gate,
+                                   ActivationType active_state) {
+#ifdef __AVX__
+  __m256 r_value_in;
+  __m256 r_value_ig;
+  __m256 r_value_fg;
+  __m256 r_value_og;
+  __m256 r_checkI = _mm256_set1_ps(0.0f);
+  __m256 r_checkF = _mm256_set1_ps(0.0f);
+  __m256 r_checkO = _mm256_set1_ps(0.0f);
+  __m256 r_state;
+  __m256 r_prev_state = _mm256_set1_ps(0.0f);
+  __m256 r_state_atv;
+  __m256 r_out;
+  __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value);
+  __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size);
+  __m256 *value_fg =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2);
+  __m256 *value_og =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3);
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_value_in = value_in[i];
+    r_value_ig = value_ig[i];
+    r_value_fg = value_fg[i];
+    r_value_og = value_og[i];
+    if (value.check_ig) {
+      r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i];
+      r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i];
+      r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i];
+    }
+    if (value.prev_state_value) {
+      r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i];
+    }
+    op(&r_value_in,
+       &r_value_ig,
+       &r_value_fg,
+       &r_value_og,
+       &r_prev_state,
+       &r_state,
+       &r_state_atv,
+       &r_out,
+       &r_checkI,
+       &r_checkF,
+       &r_checkO,
+       &cell_clip,
+       active_node,
+       active_gate,
+       active_state);
+    value_in[i] = r_value_in;
+    value_ig[i] = r_value_ig;
+    value_fg[i] = r_value_fg;
+    value_og[i] = r_value_og;
+    (reinterpret_cast<__m256 *>(value.state_value))[i] = r_state;
+    (reinterpret_cast<__m256 *>(value.state_active_value))[i] = r_state_atv;
+    (reinterpret_cast<__m256 *>(value.output_value))[i] = r_out;
+  }
+#endif
+}
+template <class T, class Op>
+void avx_lstm_backward_one_sequence(Op op,
+                                    LstmMetaValue<T> value,
+                                    LstmMetaGrad<T> grad,
+                                    int frame_size,
+                                    T cell_clip,
+                                    ActivationType active_node,
+                                    ActivationType active_gate,
+                                    ActivationType active_state) {
+#ifdef __AVX__
+  __m256 r_value_in;
+  __m256 r_value_ig;
+  __m256 r_value_fg;
+  __m256 r_value_og;
+  __m256 r_grad_in;
+  __m256 r_grad_ig;
+  __m256 r_grad_fg;
+  __m256 r_grad_og;
+  __m256 r_prev_state = _mm256_set1_ps(0.0f);
+  __m256 r_prev_state_grad;
+  __m256 r_state_grad;
+  __m256 r_state;
+  __m256 r_state_atv;
+  __m256 r_output_grad;
+  __m256 r_checkI = _mm256_set1_ps(0.0f);
+  __m256 r_checkF = _mm256_set1_ps(0.0f);
+  __m256 r_checkO = _mm256_set1_ps(0.0f);
+  __m256 r_checkIGrad;
+  __m256 r_checkFGrad;
+  __m256 r_checkOGrad;
+  __m256 *value_in = reinterpret_cast<__m256 *>(value.gate_value);
+  __m256 *value_ig = reinterpret_cast<__m256 *>(value.gate_value + frame_size);
+  __m256 *value_fg =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 2);
+  __m256 *value_og =
+      reinterpret_cast<__m256 *>(value.gate_value + frame_size * 3);
+  __m256 *grad_in = reinterpret_cast<__m256 *>(grad.gate_grad);
+  __m256 *grad_ig = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size);
+  __m256 *grad_fg = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 2);
+  __m256 *grad_og = reinterpret_cast<__m256 *>(grad.gate_grad + frame_size * 3);
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_value_in = value_in[i];
+    r_value_ig = value_ig[i];
+    r_value_fg = value_fg[i];
+    r_value_og = value_og[i];
+    if (value.check_ig) {
+      r_checkI = (reinterpret_cast<__m256 *>(value.check_ig))[i];
+      r_checkF = (reinterpret_cast<__m256 *>(value.check_fg))[i];
+      r_checkO = (reinterpret_cast<__m256 *>(value.check_og))[i];
+    }
+    r_state = (reinterpret_cast<__m256 *>(value.state_value))[i];
+    r_state_atv = (reinterpret_cast<__m256 *>(value.state_active_value))[i];
+    r_output_grad = (reinterpret_cast<__m256 *>(grad.output_grad))[i];
+    r_state_grad = (reinterpret_cast<__m256 *>(grad.state_grad))[i];
+    if (value.prev_state_value) {
+      r_prev_state = (reinterpret_cast<__m256 *>(value.prev_state_value))[i];
+    }
+    op(&r_value_in,
+       &r_value_ig,
+       &r_value_fg,
+       &r_value_og,
+       &r_grad_in,
+       &r_grad_ig,
+       &r_grad_fg,
+       &r_grad_og,
+       &r_prev_state,
+       &r_prev_state_grad,
+       &r_state,
+       &r_state_grad,
+       &r_state_atv,
+       &r_output_grad,
+       &r_checkI,
+       &r_checkF,
+       &r_checkO,
+       &r_checkIGrad,
+       &r_checkFGrad,
+       &r_checkOGrad,
+       &cell_clip,
+       active_node,
+       active_gate,
+       active_state);
+    grad_in[i] = r_grad_in;
+    grad_ig[i] = r_grad_ig;
+    grad_fg[i] = r_grad_fg;
+    grad_og[i] = r_grad_og;
+    (reinterpret_cast<__m256 *>(grad.state_grad))[i] = r_state_grad;
+    if (grad.prev_state_grad)
+      (reinterpret_cast<__m256 *>(grad.prev_state_grad))[i] = r_prev_state_grad;
+    if (value.prev_state_value) {
+      if (grad.check_ig_grad)
+        (reinterpret_cast<__m256 *>(grad.check_ig_grad))[i] += r_checkIGrad;
+      if (grad.check_fg_grad)
+        (reinterpret_cast<__m256 *>(grad.check_fg_grad))[i] += r_checkFGrad;
+    }
+    if (grad.check_og_grad)
+      (reinterpret_cast<__m256 *>(grad.check_og_grad))[i] += r_checkOGrad;
+  }
+#endif
+}
+template <class T, class Op>
+void cpu_lstm_forward(Op op,
+                      LstmMetaValue<T> value,
+                      int frame_size,
+                      T cell_clip,
+                      ActivationType active_node,
+                      ActivationType active_gate,
+                      ActivationType active_state) {
+  if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
+    avx_lstm_forward_one_sequence<T>(op,
+                                     value,
+                                     frame_size,
+                                     cell_clip,
+                                     active_node,
+                                     active_gate,
+                                     active_state);
+  } else {
+    naive_lstm_forward_one_sequence<T>(op,
+                                       value,
+                                       frame_size,
+                                       cell_clip,
+                                       active_node,
+                                       active_gate,
+                                       active_state);
+  }
+}
+template <class T, class Op>
+void cpu_lstm_backward(Op op,
+                       LstmMetaValue<T> value,
+                       LstmMetaGrad<T> grad,
+                       int frame_size,
+                       T cell_clip,
+                       ActivationType active_node,
+                       ActivationType active_gate,
+                       ActivationType active_state) {
+  if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
+    avx_lstm_backward_one_sequence<T>(op,
+                                      value,
+                                      grad,
+                                      frame_size,
+                                      cell_clip,
+                                      active_node,
+                                      active_gate,
+                                      active_state);
+  } else {
+    naive_lstm_backward_one_sequence<T>(op,
+                                        value,
+                                        grad,
+                                        frame_size,
+                                        cell_clip,
+                                        active_node,
+                                        active_gate,
+                                        active_state);
+  }
+}
+#endif
+}  // namespace detail
+}  // namespace math
+}  // namespace x86
+}  // namespace lite
+}  // namespace paddle
--- a/lite/backends/x86/math/detail/lstm_kernel.h
+++ b/lite/backends/x86/math/detail/lstm_kernel.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <type_traits>
+#include "lite/backends/x86/math/detail/activation_functions.h"
+namespace paddle {
+namespace lite {
+namespace x86 {
+namespace math {
+namespace detail {
+namespace forward {
+template <class T>
+class lstm {
+ public:
+  HOSTDEVICE void operator()(T *value_in,
+                             T *value_ig,
+                             T *value_fg,
+                             T *value_og,
+                             T *prev_state,
+                             T *state,
+                             T *state_atv,
+                             T *output,
+                             T *checkI,
+                             T *checkF,
+                             T *checkO,
+                             T *cell_clip,
+                             ActivationType active_node,
+                             ActivationType active_gate,
+                             ActivationType active_state) {
+    *value_in = activation(*value_in, active_node);
+    *value_ig = activation(*value_ig + (*prev_state) * (*checkI), active_gate);
+    *value_fg = activation(*value_fg + (*prev_state) * (*checkF), active_gate);
+    *state = (*value_in) * (*value_ig) + (*prev_state) * (*value_fg);
+    if (*cell_clip > 0.0) {
+      if (*state < -1.0 * (*cell_clip)) {
+        *state = -1.0 * (*cell_clip);
+      }
+      if (*state > *cell_clip) {
+        *state = *cell_clip;
+      }
+    }
+    *value_og = activation(*value_og + (*state) * (*checkO), active_gate);
+    *state_atv = activation(*state, active_state);
+    *output = (*value_og) * (*state_atv);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__  // If not compiled with AVX instructs. Disable AVX by default
+  static const bool avx = false;
+#else
+  // Only float support AVX optimization
+  static const bool avx = std::is_same<T, float>::value;
+  HOSTDEVICE void operator()(__m256 *value_in,
+                             __m256 *value_ig,
+                             __m256 *value_fg,
+                             __m256 *value_og,
+                             __m256 *prev_state,
+                             __m256 *state,
+                             __m256 *state_atv,
+                             __m256 *output,
+                             __m256 *checkI,
+                             __m256 *checkF,
+                             __m256 *checkO,
+                             T *cell_clip,
+                             ActivationType active_node,
+                             ActivationType active_gate,
+                             ActivationType active_state) {
+    *value_in = activation(*value_in, active_node);
+    *value_ig = activation(
+        _mm256_add_ps(*value_ig, _mm256_mul_ps(*prev_state, *checkI)),
+        active_gate);
+    *value_fg = activation(
+        _mm256_add_ps(*value_fg, _mm256_mul_ps(*prev_state, *checkF)),
+        active_gate);
+    *state = _mm256_add_ps(_mm256_mul_ps(*value_in, *value_ig),
+                           _mm256_mul_ps(*prev_state, *value_fg));
+    if (*cell_clip > 0.0f) {
+      __m256 min = _mm256_set1_ps(0.0f - *cell_clip);
+      __m256 max = _mm256_set1_ps(*cell_clip);
+      *state = _mm256_min_ps(max, *state);
+      *state = _mm256_max_ps(min, *state);
+    }
+    *value_og = activation(
+        _mm256_add_ps(*value_og, _mm256_mul_ps(*state, *checkO)), active_gate);
+    *state_atv = activation(*state, active_state);
+    *output = _mm256_mul_ps(*value_og, *state_atv);
+  }
+#endif
+#endif
+};
+}  // namespace forward
+namespace backward {
+template <class T>
+class lstm {
+ public:
+  HOSTDEVICE void operator()(T *value_in,
+                             T *value_ig,
+                             T *value_fg,
+                             T *value_og,
+                             T *grad_in,
+                             T *grad_ig,
+                             T *grad_fg,
+                             T *grad_og,
+                             T *prev_state,
+                             T *prev_state_grad,
+                             T *state,
+                             T *state_grad,
+                             T *state_atv,
+                             T *output_grad,
+                             T *checkI,
+                             T *checkF,
+                             T *checkO,
+                             T *checkIGrad,
+                             T *checkFGrad,
+                             T *checkOGrad,
+                             T *cell_clip,
+                             ActivationType active_node,
+                             ActivationType active_gate,
+                             ActivationType active_state) {
+    *grad_og =
+        activation((*output_grad) * (*state_atv), *value_og, active_gate);
+    if (*cell_clip > 0.0f) {
+      if (*state >= (*cell_clip) || *state <= (0.0f - (*cell_clip))) {
+        *state_grad = 0.0f;
+      } else {
+        *state_grad +=
+            activation((*output_grad) * (*value_og), *state_atv, active_state) +
+            (*grad_og) * (*checkO);
+      }
+    } else {
+      *state_grad +=
+          activation((*output_grad) * (*value_og), *state_atv, active_state) +
+          (*grad_og) * (*checkO);
+    }
+    *grad_in = activation((*state_grad) * (*value_ig), *value_in, active_node);
+    *grad_ig = activation((*state_grad) * (*value_in), *value_ig, active_gate);
+    *grad_fg =
+        activation((*state_grad) * (*prev_state), *value_fg, active_gate);
+    *prev_state_grad = (*grad_ig) * (*checkI) + (*grad_fg) * (*checkF) +
+                       (*state_grad) * (*value_fg);
+    *checkIGrad = (*grad_ig) * (*prev_state);
+    *checkFGrad = (*grad_fg) * (*prev_state);
+    *checkOGrad = (*grad_og) * (*state);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__  // If not compiled with AVX instructs. Disable AVX by default
+  static const bool avx = false;
+#else
+  // Only float support AVX optimization
+  static const bool avx = std::is_same<T, float>::value;
+  HOSTDEVICE void operator()(__m256 *value_in,
+                             __m256 *value_ig,
+                             __m256 *value_fg,
+                             __m256 *value_og,
+                             __m256 *grad_in,
+                             __m256 *grad_ig,
+                             __m256 *grad_fg,
+                             __m256 *grad_og,
+                             __m256 *prev_state,
+                             __m256 *prev_state_grad,
+                             __m256 *state,
+                             __m256 *state_grad,
+                             __m256 *state_atv,
+                             __m256 *output_grad,
+                             __m256 *checkI,
+                             __m256 *checkF,
+                             __m256 *checkO,
+                             __m256 *checkIGrad,
+                             __m256 *checkFGrad,
+                             __m256 *checkOGrad,
+                             T *cell_clip,
+                             ActivationType active_node,
+                             ActivationType active_gate,
+                             ActivationType active_state) {
+    *grad_og = activation(
+        _mm256_mul_ps(*output_grad, *state_atv), *value_og, active_gate);
+    if (*cell_clip > 0.0f) {
+      T *state_ = reinterpret_cast<T *>(state);
+      if (*state_ >= (*cell_clip) || *state_ <= (0.0f - (*cell_clip))) {
+        *state_grad = _mm256_set1_ps(0.0f);
+      } else {
+        *state_grad =
+            _mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og),
+                                     *state_atv,
+                                     active_state),
+                          *state_grad);
+        *state_grad =
+            _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
+      }
+    }
+    *grad_in = activation(
+        _mm256_mul_ps(*state_grad, *value_ig), *value_in, active_node);
+    *grad_ig = activation(
+        _mm256_mul_ps(*state_grad, *value_in), *value_ig, active_gate);
+    *grad_fg = activation(
+        _mm256_mul_ps(*state_grad, *prev_state), *value_fg, active_gate);
+    *prev_state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_ig, *checkI),
+                                     _mm256_mul_ps(*grad_fg, *checkF));
+    *prev_state_grad =
+        _mm256_add_ps(_mm256_mul_ps(*state_grad, *value_fg), *prev_state_grad);
+    *checkIGrad = _mm256_mul_ps(*grad_ig, *prev_state);
+    *checkFGrad = _mm256_mul_ps(*grad_fg, *prev_state);
+    *checkOGrad = _mm256_mul_ps(*grad_og, *state);
+  }
+#endif
+#endif
+};
+}  // namespace backward
+}  // namespace detail
+}  // namespace math
+}  // namespace x86
+}  // namespace lite
+}  // namespace paddle
--- a/lite/backends/x86/math/lstm_compute.cc
+++ b/lite/backends/x86/math/lstm_compute.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "lite/backends/x86/math/lstm_compute.h"
+#include "lite/backends/x86/math/detail/lstm_cpu_kernel.h"
+#include "lite/backends/x86/math/detail/lstm_kernel.h"
+namespace paddle {
+namespace lite {
+namespace x86 {
+namespace math {
+template <class T>
+struct LstmUnitFunctor<lite::TargetType::kX86, T> {
+  static void compute(const lite::X86Context& context,
+                      LstmMetaValue<T> value,
+                      int frame_size,
+                      int batch_size,
+                      T cell_clip,
+                      const detail::ActivationType& gate_act,
+                      const detail::ActivationType& cell_act,
+                      const detail::ActivationType& cand_act) {
+    for (int b = 0; b < batch_size; b++) {
+      detail::cpu_lstm_forward(detail::forward::lstm<T>(),
+                               value,
+                               frame_size,
+                               cell_clip,
+                               cand_act,
+                               gate_act,
+                               cell_act);
+      value.gate_value += frame_size * 4;
+      value.state_value += frame_size;
+      value.state_active_value += frame_size;
+      value.output_value += frame_size;
+      if (value.prev_state_value) {
+        value.prev_state_value += frame_size;
+      }
+    }
+  }
+};
+template <class T>
+struct LstmUnitGradFunctor<lite::TargetType::kX86, T> {
+  static void compute(const lite::X86Context& context,
+                      LstmMetaValue<T> value,
+                      LstmMetaGrad<T> grad,
+                      int frame_size,
+                      int batch_size,
+                      T cell_clip,
+                      const detail::ActivationType& gate_act,
+                      const detail::ActivationType& cell_act,
+                      const detail::ActivationType& cand_act) {
+    for (int b = 0; b < batch_size; b++) {
+      detail::cpu_lstm_backward(detail::backward::lstm<T>(),
+                                value,
+                                grad,
+                                frame_size,
+                                cell_clip,
+                                cand_act,
+                                gate_act,
+                                cell_act);
+      value.gate_value += frame_size * 4;
+      value.state_value += frame_size;
+      value.state_active_value += frame_size;
+      value.output_value += frame_size;
+      if (value.prev_state_value) {
+        value.prev_state_value += frame_size;
+      }
+      grad.gate_grad += frame_size * 4;
+      grad.state_grad += frame_size;
+      grad.state_active_grad += frame_size;
+      grad.output_grad += frame_size;
+      if (grad.prev_state_grad) {
+        grad.prev_state_grad += frame_size;
+      }
+    }
+  }
+};
+template class LstmUnitFunctor<lite::TargetType::kX86, float>;
+template class LstmUnitFunctor<lite::TargetType::kX86, double>;
+template class LstmUnitGradFunctor<lite::TargetType::kX86, float>;
+template class LstmUnitGradFunctor<lite::TargetType::kX86, double>;
+}  // namespace math
+}  // namespace x86
+}  // namespace lite
+}  // namespace paddle
--- a/lite/backends/x86/math/lstm_compute.h
+++ b/lite/backends/x86/math/lstm_compute.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "lite/backends/x86/math/detail/activation_functions.h"
+#include "lite/core/context.h"
+#include "lite/utils/paddle_enforce.h"
+namespace paddle {
+namespace lite {
+namespace x86 {
+namespace math {
+template <class T>
+struct LstmMetaValue {
+  T *gate_value;
+  T *prev_state_value;
+  T *state_value;
+  T *state_active_value;
+  T *output_value;
+  T *check_ig;
+  T *check_fg;
+  T *check_og;
+};
+template <class T>
+struct LstmMetaGrad {
+  T *gate_grad;
+  T *prev_state_grad;
+  T *state_grad;
+  T *state_active_grad;
+  T *output_grad;
+  T *check_ig_grad;
+  T *check_fg_grad;
+  T *check_og_grad;
+};
+template <lite::TargetType Target, typename T>
+class LstmUnitFunctor {
+ public:
+  static void compute(const lite::Context<Target> &context,
+                      LstmMetaValue<T> value,
+                      int frame_size,
+                      int batch_size,
+                      T cell_clip,
+                      const detail::ActivationType &gate_act,
+                      const detail::ActivationType &cell_act,
+                      const detail::ActivationType &cand_act);
+};
+template <lite::TargetType Target, typename T>
+class LstmUnitGradFunctor {
+ public:
+  static void compute(const lite::Context<Target> &context,
+                      LstmMetaValue<T> value,
+                      LstmMetaGrad<T> grad,
+                      int frame_size,
+                      int batch_size,
+                      T cell_clip,
+                      const detail::ActivationType &gate_act,
+                      const detail::ActivationType &cell_act,
+                      const detail::ActivationType &cand_act);
+};
+}  // namespace math
+}  // namespace x86
+}  // namespace lite
+}  // namespace paddle
--- a/lite/backends/x86/math/selected_rows_functor.cc
+++ b/lite/backends/x86/math/selected_rows_functor.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <algorithm>
+#include <set>
+#include <unordered_map>
+#include "lite/backends/x86/math/blas.h"
+#include "lite/backends/x86/math/selected_rows_functor.h"
+namespace paddle {
+namespace lite {
+namespace x86 {
+namespace math {
+template <typename T>
+struct SelectedRowsAdd<lite::TargetType::kX86, T> {
+  void operator()(const lite::X86Context& context,
+                  const fluid::SelectedRows& input1,
+                  const fluid::SelectedRows& input2,
+                  fluid::SelectedRows* output) {
+    auto in1_height = input1.height();
+    PADDLE_ENFORCE_EQ(in1_height, input2.height());
+    output->set_height(in1_height);
+    auto& in1_rows = input1.rows();
+    auto& in2_rows = input2.rows();
+    std::vector<int64_t> out_rows;
+    out_rows.reserve(in1_rows.size() + in2_rows.size());
+    // concat rows
+    out_rows.insert(out_rows.end(), in1_rows.begin(), in1_rows.end());
+    out_rows.insert(out_rows.end(), in2_rows.begin(), in2_rows.end());
+    output->set_rows(out_rows);
+    auto* out_value = output->mutable_value();
+    auto& in1_value = input1.value();
+    auto& in2_value = input2.value();
+    auto in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
+    PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
+    auto* out_data = out_value->mutable_data<T>();
+    auto* in1_data = in1_value.data<T>();
+    std::copy_n(in1_data, in1_value.numel(), out_data);
+    auto* in2_data = in2_value.data<T>();
+    std::copy_n(in2_data, in2_value.numel(), out_data + in1_value.numel());
+  }
+};
+template struct SelectedRowsAdd<lite::TargetType::kX86, float>;
+template struct SelectedRowsAdd<lite::TargetType::kX86, double>;
+template <typename T>
+struct SelectedRowsAddTensor<lite::TargetType::kX86, T> {
+  void operator()(const lite::X86Context& context,
+                  const fluid::SelectedRows& input1,
+                  const lite::Tensor& input2,
+                  lite::Tensor* output) {
+    auto in1_height = input1.height();
+    auto in2_dims = input2.dims();
+    auto out_dims = output->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+    PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
+    PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height);
+    SetConstant<lite::TargetType::kX86, T> functor;
+    functor(context, output, 0.0);
+    auto* in1_data = in1_value.data<T>();
+    auto* out_data = output->mutable_data<T>();
+    for (size_t i = 0; i < in1_rows.size(); i++) {
+      for (int64_t j = 0; j < in1_row_numel; j++) {
+        out_data[in1_rows[i] * in1_row_numel + j] +=
+            in1_data[i * in1_row_numel + j];
+      }
+    }
+    auto out_eigen = fluid::EigenVector<T>::Flatten(*output);
+    auto in2_eigen = fluid::EigenVector<T>::Flatten(input2);
+    out_eigen.device(lite::fluid::EigenDeviceType<TARGET(kX86)>()) =
+        out_eigen + in2_eigen;
+  }
+};
+template struct SelectedRowsAddTensor<lite::TargetType::kX86, float>;
+template struct SelectedRowsAddTensor<lite::TargetType::kX86, double>;
+template <typename T>
+struct SelectedRowsAddTo<lite::TargetType::kX86, T> {
+  void operator()(const lite::X86Context& context,
+                  const fluid::SelectedRows& input1,
+                  const int64_t input2_offset,
+                  fluid::SelectedRows* input2) {
+    auto in1_height = input1.height();
+    PADDLE_ENFORCE_EQ(in1_height, input2->height());
+    auto& in1_rows = input1.rows();
+    auto& in2_rows = *(input2->mutable_rows());
+    auto& in1_value = input1.value();
+    auto* in2_value = input2->mutable_value();
+    // concat rows
+    in2_rows.reserve(in2_rows.size() +
+                     size_t(in1_rows.end() - in1_rows.begin()));
+    in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end());
+    auto* in1_data = in1_value.data<T>();
+    auto* in2_data = in2_value->mutable_data<T>();
+    std::copy_n(in1_data, in1_value.numel(), in2_data + input2_offset);
+  }
+};
+template struct SelectedRowsAddTo<lite::TargetType::kX86, float>;
+template struct SelectedRowsAddTo<lite::TargetType::kX86, double>;
+template struct SelectedRowsAddTo<lite::TargetType::kX86, int>;
+template struct SelectedRowsAddTo<lite::TargetType::kX86, int64_t>;
+template <typename T>
+struct SelectedRowsSumTo<lite::TargetType::kX86, T> {
+  void operator()(const lite::X86Context& context,
+                  const std::vector<fluid::SelectedRows*>& input1,
+                  const std::vector<int64_t>& input2_offsets,
+                  fluid::SelectedRows* input2) {
+    // Ensure all selected rows have the same height
+    size_t size = 0u;
+    for (auto iter = input1.begin(); iter != input1.end(); ++iter) {
+      auto& in_rows = (*iter)->rows();
+      size += in_rows.end() - in_rows.begin();
+      auto in1_height = (*iter)->height();
+      PADDLE_ENFORCE_EQ(in1_height, input2->height());
+    }
+    // concat rows
+    std::vector<int64_t> in2_rows;
+    in2_rows.reserve(in2_rows.size() + size);
+    for (auto iter = input1.begin(); iter != input1.end(); ++iter) {
+      const std::vector<int64_t>& in_rows = (*iter)->rows();
+      in2_rows.insert(in2_rows.end(), in_rows.begin(), in_rows.end());
+    }
+    input2->set_rows(in2_rows);
+    auto* in2_value = input2->mutable_value();
+    T* in2_data = in2_value->mutable_data<T>();
+    auto blas = math::GetBlas<lite::TargetType::kX86, T>(context);
+    size_t offset = 0u;
+    for (size_t i = 0u; i != input1.size(); ++i) {
+      auto& in_value = input1[i]->value();
+      const T* in_data = in_value.data<T>();
+      offset += input2_offsets[i];
+      blas.VCOPY(in_value.numel(), in_data, in2_data + offset);
+    }
+  }
+};
+template struct SelectedRowsSumTo<lite::TargetType::kX86, float>;
+template struct SelectedRowsSumTo<lite::TargetType::kX86, double>;
+template <typename T>
+struct SelectedRowsAddToTensor<lite::TargetType::kX86, T> {
+  void operator()(const lite::X86Context& context,
+                  const fluid::SelectedRows& input1,
+                  lite::Tensor* input2) {
+    CHECK(input1.rows().size() != 0) << "input selected rows is empty!";
+    auto in1_height = input1.height();
+    auto in2_dims = input2->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
+    auto* in1_data = in1_value.data<T>();
+    auto* input2_data = input2->mutable_data<T>();
+    for (size_t i = 0; i < in1_rows.size(); i++) {
+      for (int64_t j = 0; j < in1_row_numel; j++) {
+        input2_data[in1_rows[i] * in1_row_numel + j] +=
+            in1_data[i * in1_row_numel + j];
+      }
+    }
+  }
+};
+template struct SelectedRowsAddToTensor<lite::TargetType::kX86, float>;
+template struct SelectedRowsAddToTensor<lite::TargetType::kX86, double>;
+template struct SelectedRowsAddToTensor<lite::TargetType::kX86, int>;
+template struct SelectedRowsAddToTensor<lite::TargetType::kX86, int64_t>;
+// This is a separated namespace for manipulate SelectedRows typed
+// data. Like merge duplicated rows, adding two SelectedRows etc.
+//
+// Another group of functors is called "scatter updates", which means
+// use SelectedRows to update a dense tensor with different Ops, like
+// add or mul.
+namespace scatter {
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_floating_point<T>::value &&
+    std::is_same<DeviceContext, lite::X86Context>::value>::type
+elementwise_add_to(const DeviceContext& ctx,
+                   BlasT<lite::TargetType::kX86, T>* blas,
+                   size_t data_len,
+                   const T* in,
+                   T* out) {
+  blas->AXPY(data_len, 1., in, out);
+}
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    !std::is_floating_point<T>::value &&
+    std::is_same<DeviceContext, lite::X86Context>::value>::type
+elementwise_add_to(const DeviceContext& ctx,
+                   BlasT<lite::TargetType::kX86, T>* blas,
+                   size_t data_len,
+                   const T* in,
+                   T* out) {
+  for (size_t i = 0; i < data_len; i++) {
+    out[i] += in[i];
+  }
+}
+template <typename T>
+struct MergeAdd<lite::TargetType::kX86, T> {
+  fluid::SelectedRows operator()(const lite::X86Context& context,
+                                 const fluid::SelectedRows& input,
+                                 const bool sorted_result = false) {
+    fluid::SelectedRows out;
+    (*this)(context, input, &out, sorted_result);
+    return out;
+  }
+  void operator()(const lite::X86Context& context,
+                  const fluid::SelectedRows& input,
+                  fluid::SelectedRows* output,
+                  const bool sorted_result = false) {
+    std::vector<const fluid::SelectedRows*> inputs;
+    inputs.push_back(&input);
+    (*this)(context, inputs, output, sorted_result);
+  }
+  void operator()(const lite::X86Context& context,
+                  const std::vector<const fluid::SelectedRows*>& inputs,
+                  fluid::SelectedRows* output,
+                  const bool sorted_result = false) {
+    if (inputs.size() == 0) {
+      VLOG(3) << "no input! return";
+      return;
+    }
+    const fluid::SelectedRows* has_value_input = nullptr;
+    for (auto* in : inputs) {
+      if (in->rows().size() > 0) {
+        has_value_input = in;
+        break;
+      }
+    }
+    if (has_value_input == nullptr) {
+      VLOG(3) << "no input has value! just return" << std::endl;
+      return;
+    }
+    auto input_width = has_value_input->value().dims()[1];
+    auto input_height = has_value_input->height();
+    fluid::SelectedRows& out = *output;
+    std::set<int64_t> merged_row_set;
+    size_t row_num = 0;
+    for (auto* input : inputs) {
+      if (input->rows().size() == 0) {
+        continue;
+      }
+      PADDLE_ENFORCE_EQ(input_width,
+                        input->value().dims()[1],
+                        "all input should have same "
+                        "dimension except for the first one");
+      PADDLE_ENFORCE_EQ(
+          input_height, input->height(), "all input should have same height");
+      row_num += input->rows().size();
+      merged_row_set.insert(input->rows().begin(), input->rows().end());
+    }
+    out.set_height(input_height);
+    lite::DDim dims(std::vector<int64_t>(
+        {static_cast<int64_t>(merged_row_set.size()), input_width}));
+    out.mutable_value()->Resize(dims);
+    auto* out_data = out.mutable_value()->mutable_data<T>();
+    if (merged_row_set.size() == row_num && !sorted_result) {
+      // no duplicated ids, just concat the result together
+      std::vector<int64_t> merge_rows;
+      merge_rows.reserve(row_num);
+      // concat rows
+      for (auto* in : inputs) {
+        merge_rows.insert(
+            merge_rows.end(), in->rows().begin(), in->rows().end());
+      }
+      out.set_rows(merge_rows);
+      int64_t copied_numel = 0;
+      for (auto* in : inputs) {
+        auto* in_data = in->value().data<T>();
+        auto in_numel = in->value().numel();
+        std::copy_n(in_data, in_numel, out_data + copied_numel);
+        copied_numel += in_numel;
+      }
+    } else {
+      std::vector<int64_t> merge_rows(merged_row_set.begin(),
+                                      merged_row_set.end());
+      if (sorted_result) {
+        std::sort(merge_rows.begin(), merge_rows.end());
+      }
+      out.set_rows(merge_rows);
+      math::SetConstant<lite::TargetType::kX86, T> constant_functor;
+      constant_functor(context, out.mutable_value(), 0.0);
+      std::unordered_map<int64_t, size_t> rows_to_id;
+      for (size_t i = 0; i < merge_rows.size(); ++i) {
+        rows_to_id[merge_rows[i]] = i;
+      }
+      auto blas = math::GetBlas<lite::TargetType::kX86, T>(context);
+      for (auto* input : inputs) {
+        if (input->rows().size() == 0) {
+          continue;
+        }
+        auto* input_data = input->value().data<T>();
+        auto& input_rows = input->rows();
+        for (size_t i = 0; i < input_rows.size(); i++) {
+          size_t out_i = rows_to_id[input_rows[i]];
+          elementwise_add_to<lite::X86Context, T>(
+              context,
+              &blas,
+              static_cast<size_t>(input_width),
+              &input_data[i * input_width],
+              &out_data[out_i * input_width]);
+        }
+      }
+    }
+  }
+};
+template struct MergeAdd<lite::TargetType::kX86, int>;
+template struct MergeAdd<lite::TargetType::kX86, int64_t>;
+template struct MergeAdd<lite::TargetType::kX86, float>;
+template struct MergeAdd<lite::TargetType::kX86, double>;
+template <typename T>
+struct UpdateToTensor<lite::TargetType::kX86, T> {
+  void operator()(const lite::X86Context& context,
+                  const ScatterOps& op,
+                  const fluid::SelectedRows& input1,
+                  lite::Tensor* input2) {
+    auto in1_height = input1.height();
+    auto in2_dims = input2->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
+    auto* in1_data = in1_value.data<T>();
+    auto* input2_data = input2->data<T>();
+    // FIXME(typhoonzero): use macro fix the below messy code.
+    switch (op) {
+      case ScatterOps::ASSIGN:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] =
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::ADD:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] +=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::SUB:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] -=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::SUBBY:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] =
+            in1_data[i * in1_row_numel + j] -
+            input2_data[in1_rows[i] * in1_row_numel + j];
+        break;
+      case ScatterOps::MUL:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] *=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::DIV:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] /=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::DIVBY:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] =
+            in1_data[i * in1_row_numel + j] /
+            input2_data[in1_rows[i] * in1_row_numel + j];
+        break;
+    }
+  }
+};
+}  // namespace scatter
+}  // namespace math
+}  // namespace x86
+}  // namespace lite
+}  // namespace paddle
--- a/lite/backends/x86/math/selected_rows_functor.h
+++ b/lite/backends/x86/math/selected_rows_functor.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <map>
+#include <vector>
+#include "lite/backends/x86/math/blas.h"
+#include "lite/backends/x86/math/math_function.h"
+#include "lite/core/context.h"
+#include "lite/fluid/eigen.h"
+#include "lite/fluid/selected_rows.h"
+#define INLINE_FOR2(sizei, sizej)     \
+  for (int64_t i = 0; i < sizei; i++) \
+    for (int64_t j = 0; j < sizej; j++)
+namespace paddle {
+namespace lite {
+namespace x86 {
+namespace math {
+template <lite::TargetType Target, typename T>
+struct SelectedRowsAdd {
+  void operator()(const lite::Context<Target>& context,
+                  const fluid::SelectedRows& input1,
+                  const fluid::SelectedRows& input2,
+                  fluid::SelectedRows* output);
+};
+template <lite::TargetType Target, typename T>
+struct SelectedRowsAddTensor {
+  void operator()(const lite::Context<Target>& context,
+                  const fluid::SelectedRows& input1,
+                  const lite::Tensor& input2,
+                  lite::Tensor* output);
+};
+// input2 = input1 + input2
+template <lite::TargetType Target, typename T>
+struct SelectedRowsAddTo {
+  void operator()(const lite::Context<Target>& context,
+                  const fluid::SelectedRows& input1,
+                  const int64_t input2_offset,
+                  fluid::SelectedRows* input2);
+};
+// input2 = [all input in input1] + input2
+template <lite::TargetType Target, typename T>
+struct SelectedRowsSumTo {
+  void operator()(const lite::Context<Target>& context,
+                  const std::vector<fluid::SelectedRows*>& input1,
+                  const std::vector<int64_t>& input2_offsets,
+                  fluid::SelectedRows* input2);
+};
+// FIXME: The result of SelectedRowsAddToTensor maybe non deterministic,
+// because it uses CudaAtomicAdd.
+// input2 = input1 + input2
+template <lite::TargetType Target, typename T>
+struct SelectedRowsAddToTensor {
+  void operator()(const lite::Context<Target>& context,
+                  const fluid::SelectedRows& input1,
+                  lite::Tensor* input2);
+};
+namespace scatter {
+// functors for manuplating SelectedRows data
+template <lite::TargetType Target, typename T>
+struct MergeAdd {
+  // unary functor, merge by adding duplicated rows in
+  // the input SelectedRows object.
+  fluid::SelectedRows operator()(const lite::Context<Target>& context,
+                                 const fluid::SelectedRows& input,
+                                 const bool sorted_result = false);
+  void operator()(const lite::Context<Target>& context,
+                  const fluid::SelectedRows& input,
+                  fluid::SelectedRows* output,
+                  const bool sorted_result = false);
+  void operator()(const lite::Context<Target>& context,
+                  const std::vector<const fluid::SelectedRows*>& inputs,
+                  fluid::SelectedRows* output,
+                  const bool sorted_result = false);
+};
+enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY };
+// out = selected_rows_in / tensor
+template <lite::TargetType Target, typename T>
+struct UpdateToTensor {
+  void operator()(const lite::Context<Target>& context,
+                  const ScatterOps& op,
+                  const fluid::SelectedRows& input1,
+                  lite::Tensor* input2);
+};
+}  // namespace scatter
+}  // namespace math
+}  // namespace x86
+}  // namespace lite
+}  // namespace paddle
--- a/lite/fluid/CMakeLists.txt
+++ b/lite/fluid/CMakeLists.txt
 if (LITE_WITH_X86)
 lite_cc_library(fluid_data_type SRCS data_type.cc DEPS framework_proto eigen3)
-# lite_cc_library(selected_rows SRCS selected_rows.cc)
+lite_cc_library(selected_rows SRCS selected_rows.cc DEPS tensor model_parser)
 endif()
--- a/lite/fluid/rw_lock.h
+++ b/lite/fluid/rw_lock.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#if !defined(_WIN32)
+#include <pthread.h>
+#else
+#include <mutex>  // NOLINT
+#endif            // !_WIN32
+#include "lite/utils/paddle_enforce.h"
+namespace paddle {
+namespace lite {
+namespace fluid {
+#if !defined(_WIN32)
+struct RWLock {
+  RWLock() { pthread_rwlock_init(&lock_, nullptr); }
+  ~RWLock() { pthread_rwlock_destroy(&lock_); }
+  inline void RDLock() {
+    PADDLE_ENFORCE_EQ(
+        pthread_rwlock_rdlock(&lock_), 0, "acquire read lock failed");
+  }
+  inline void WRLock() {
+    PADDLE_ENFORCE_EQ(
+        pthread_rwlock_wrlock(&lock_), 0, "acquire write lock failed");
+  }
+  inline void UNLock() {
+    PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
+  }
+ private:
+  pthread_rwlock_t lock_;
+};
+// TODO(paddle-dev): Support RWLock for WIN32 for correctness.
+#else
+// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
+// In windows, rw_lock seems like a hack. Use empty object and do nothing.
+struct RWLock {
+  // FIXME(minqiyang): use mutex here to do fake lock
+  inline void RDLock() { mutex_.lock(); }
+  inline void WRLock() { mutex_.lock(); }
+  inline void UNLock() { mutex_.unlock(); }
+ private:
+  std::mutex mutex_;
+};
+#endif
+class AutoWRLock {
+ public:
+  explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
+  ~AutoWRLock() { UnLock(); }
+ private:
+  inline void Lock() { lock_->WRLock(); }
+  inline void UnLock() { lock_->UNLock(); }
+ private:
+  RWLock* lock_;
+};
+class AutoRDLock {
+ public:
+  explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
+  ~AutoRDLock() { UnLock(); }
+ private:
+  inline void Lock() { lock_->RDLock(); }
+  inline void UnLock() { lock_->UNLock(); }
+ private:
+  RWLock* lock_;
+};
+}  // namespace fluid
+}  // namespace lite
+}  // namespace paddle
--- a/lite/fluid/selected_rows.cc
+++ b/lite/fluid/selected_rows.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "lite/fluid/selected_rows.h"
+namespace paddle {
+namespace lite {
+namespace fluid {
+struct ReAllocateVisitor {
+  ReAllocateVisitor(const lite::DDim& dims, lite::Tensor* tensor)
+      : dims_(dims), tensor_(tensor) {}
+  template <typename T>
+  void operator()() const {
+    lite::Tensor cpu_tensor;
+    T* ptr = cpu_tensor.mutable_data<T>(lite::TargetType::kX86, dims_);
+    const T* old_ptr =
+        tensor_->memory_size() == 0 ? nullptr : tensor_->mutable_data<T>();
+    if (old_ptr != nullptr) {
+      std::copy(old_ptr, old_ptr + tensor_->numel(), ptr);
+    }
+    tensor_->ShareDataWith(cpu_tensor);
+  }
+  lite::DDim dims_;
+  lite::Tensor* tensor_;
+};
+struct TensorCopyVisitor {
+  TensorCopyVisitor(lite::Tensor* dst,
+                    int64_t dst_offset,
+                    const lite::Tensor src,
+                    int64_t src_offset,
+                    int64_t size)
+      : dst_(dst),
+        dst_offset_(dst_offset),
+        src_(src),
+        src_offset_(src_offset),
+        size_(size) {}
+  template <typename T>
+  void apply() const {
+    // TODO(Yancey1989): support other place
+    std::copy_n(src_.data<T>() + src_offset_,
+                size_,
+                dst_->mutable_data<T>(lite::TargetType::kX86) + dst_offset_);
+  }
+  lite::Tensor* dst_;
+  int64_t dst_offset_;
+  lite::Tensor src_;
+  int64_t src_offset_;
+  int64_t size_;
+};
+struct TensorFillVisitor {
+  TensorFillVisitor(lite::Tensor* dst,
+                    int64_t dst_offset,
+                    int64_t size,
+                    float value)
+      : dst_(dst), dst_offset_(dst_offset), size_(size) {}
+  template <typename T>
+  void apply() const {
+    // TODO(qiao): support other place
+    //    paddle::platform::CPUPlace cpu;
+    auto* tensor_data = dst_->mutable_data<T>(lite::TargetType::kX86);
+    auto* start = tensor_data + dst_offset_;
+    auto* end = start + size_;
+    std::fill(start, end, static_cast<T>(0.0));
+  }
+  lite::Tensor* dst_;
+  int64_t dst_offset_;
+  int64_t size_;
+};
+void SerializeToStream(std::ostream& os,
+                       const SelectedRows& selected_rows,
+                       const lite::Context<lite::TargetType::kX86>& dev_ctx) {
+  {  // the 1st field, uint32_t version
+    constexpr uint32_t version = 0;
+    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
+  }
+  {
+    // the 2st field, rows information
+    auto& rows = selected_rows.rows();
+    uint64_t size = rows.size();
+    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
+    for (uint64_t i = 0; i < size; ++i) {
+      os.write(reinterpret_cast<const char*>(&rows[i]), sizeof(rows[i]));
+    }
+  }
+  {
+    // the 3st field, the height of SelectedRows
+    int64_t height = selected_rows.height();
+    os.write(reinterpret_cast<const char*>(&height), sizeof(height));
+  }
+  // the 4st field, Tensor data
+  TensorToStream(os, selected_rows.value());
+}
+void DeserializeFromStream(
+    std::istream& is,
+    SelectedRows* selected_rows,
+    const lite::Context<lite::TargetType::kX86>& dev_ctx) {
+  {
+    // the 1st field, unit32_t version for SelectedRows
+    uint32_t version;
+    is.read(reinterpret_cast<char*>(&version), sizeof(version));
+    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  }
+  {
+    // the 2st field, rows information
+    uint64_t size;
+    is.read(reinterpret_cast<char*>(&size), sizeof(size));
+    auto& rows = *selected_rows->mutable_rows();
+    rows.resize(size);
+    for (uint64_t i = 0; i < size; ++i) {
+      is.read(reinterpret_cast<char*>(&rows[i]), sizeof(int64_t));
+    }
+  }
+  {
+    // the 3st field, the height of the SelectedRows
+    int64_t height;
+    is.read(reinterpret_cast<char*>(&height), sizeof(int64_t));
+    selected_rows->set_height(height);
+  }
+  // the 4st field, tensor which contains the data
+  TensorFromStream(is, selected_rows->mutable_value());
+}
+bool SelectedRows::HasKey(int64_t key) const {
+  return std::find(rows_.begin(), rows_.end(), key) == rows_.end() ? false
+                                                                   : true;
+}
+int64_t SelectedRows::AutoGrownIndex(int64_t key,
+                                     bool auto_grown,
+                                     bool is_test) {
+  if (is_test) {
+    auto iter = id_to_index_.find(key);
+    if (iter == id_to_index_.end()) {
+      return -1;
+    } else {
+      return iter->second;
+    }
+  }
+  rwlock_->RDLock();
+  auto iter = id_to_index_.find(key);
+  if (iter == id_to_index_.end()) {
+    rwlock_->UNLock();
+    if (!auto_grown) {
+      PADDLE_THROW("key %d not found", key);
+    }
+    rwlock_->WRLock();
+    auto map_size = id_to_index_.size();
+    auto vector_size = rows_.size();
+    if (map_size != vector_size) {
+      rwlock_->UNLock();
+      PADDLE_THROW(
+          "id_to_index_ size %d should have the same size with rows_ %d",
+          map_size,
+          vector_size);
+    }
+    auto write_iter = id_to_index_.find(key);
+    if (write_iter == id_to_index_.end()) {
+      int row_num = rows_.size();
+      if (row_num == value_->dims()[0]) {
+        rwlock_->UNLock();
+        PADDLE_THROW("selected rows is full, then length exceed %d", row_num);
+      }
+      // key logic to put a key into id_to_index_
+      rows_.push_back(key);
+      auto index = static_cast<int64_t>(rows_.size() - 1);
+      id_to_index_[key] = index;
+      rwlock_->UNLock();
+      return index;
+    } else {
+      auto index = write_iter->second;
+      rwlock_->UNLock();
+      return index;
+    }
+  } else {
+    auto index = iter->second;
+    rwlock_->UNLock();
+    return index;
+  }
+}
+void SelectedRows::SyncIndex() {
+  rwlock_->WRLock();
+  id_to_index_.clear();
+  for (size_t i = 0; i < rows_.size(); ++i) {
+    id_to_index_[rows_[i]] = i;
+  }
+  rwlock_->UNLock();
+}
+void SelectedRows::Get(const lite::Tensor& ids,
+                       lite::Tensor* value,
+                       bool auto_grown,
+                       bool is_test) {
+  PADDLE_ENFORCE(value->IsInitialized(),
+                 "The value tensor should be initialized.");
+  if (ids.numel() == 0) {
+    VLOG(3) << "keys is empty, please check data!";
+  } else {
+    int64_t value_width = value_->numel() / value_->dims()[0];
+    PADDLE_ENFORCE_EQ(value_width,
+                      value->numel() / value->dims()[0],
+                      "output tensor should have the same shape with table "
+                      "except the dims[0].");
+    for (int i = 0; i < ids.numel(); ++i) {
+      auto id = ids.data<int64_t>()[i];
+      int64_t index = AutoGrownIndex(id, auto_grown, is_test);
+      if (index < 0) {
+        VLOG(5) << "id " << id << " not in the table, return 0";
+        TensorFillVisitor(value, i * value_width, value_width, 0.0)
+            .apply<float>();
+      } else {
+        TensorCopyVisitor(value,
+                          i * value_width,
+                          *value_.get(),
+                          index * value_width,
+                          value_width)
+            .apply<float>();
+      }
+    }
+  }
+}
+}  // namespace fluid
+}  // namespace lite
+}  // namespace paddle
--- a/lite/fluid/selected_rows.h
+++ b/lite/fluid/selected_rows.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "lite/core/context.h"
+#include "lite/core/tensor.h"
+#include "lite/fluid/rw_lock.h"
+#include "lite/model_parser/model_parser.h"
+namespace paddle {
+namespace lite {
+namespace fluid {
+class SelectedRows {
+  /*
+   * @brief We can use the SelectedRows structure to reproduce a sparse table.
+   *  A sparse table is a key-value structure that the key is an `int64_t`,
+   *  and the value is a Tensor which the first dimension is 0.
+   *  You can use the following interface to operate the sparse table, and you
+   * can find
+   *  some detail information from the comments of each interface:
+   *
+   *  HasKey(key), whether the sparse table has the specified key.
+   *  Set(key, value), set a key-value pair into the sparse table.
+   *  Get(keys, value*), get value by given key list and apply it to the given
+   * value pointer
+   *    with the specified offset.
+   *
+   */
+ public:
+  SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
+      : rows_(rows), height_(height) {
+    value_.reset(new Tensor());
+    rwlock_.reset(new RWLock);
+  }
+  SelectedRows() {
+    height_ = 0;
+    value_.reset(new Tensor());
+    rwlock_.reset(new RWLock);
+  }
+  TargetType target() const { return value_->target(); }
+  const Tensor& value() const { return *value_; }
+  Tensor* mutable_value() { return value_.get(); }
+  int64_t height() const { return height_; }
+  void set_height(int64_t height) { height_ = height; }
+  const std::vector<int64_t>& rows() const { return rows_; }
+  std::vector<int64_t>* mutable_rows() { return &rows_; }
+  void set_rows(const std::vector<int64_t>& rows) { rows_ = rows; }
+  /*
+   * @brief Get the index of key in rows
+   *
+   * @return -1 if the key does not exists.
+   */
+  int64_t Index(int64_t key) const {
+    auto it = std::find(rows_.begin(), rows_.end(), key);
+    if (it == rows_.end()) {
+      PADDLE_THROW("id %s not in table", key);
+    }
+    return static_cast<int64_t>(std::distance(rows_.begin(), it));
+  }
+  /*
+   * @brief whether has the specified key in the table.
+   *
+   * @return true if the key is exists.
+   */
+  bool HasKey(int64_t key) const;
+  /*
+   * @brief Get value by the key list.
+   * Note!!! this interface is only used when selected_rows is used as
+   * parameters
+   * for distribute lookup table.
+   *
+   * @return a list of pair which contains the non-exists key and the index in
+   * the value
+   */
+  void Get(const lite::Tensor& ids,
+           lite::Tensor* value,
+           bool auto_grown = false,
+           bool is_test = false);
+  /*
+   * @brief Get the index of the key from id_to_index_ map. If the key not
+   * exist,
+   * add the key into id_to_index_.
+   *
+   * Note!!! this interface is only used when selected_rows is used as
+   * parameters
+   * for distribute lookup table.
+   *
+   * @return index of the key.
+   */
+  int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false);
+  /*
+   * @brief Get the index of the key from id_to_index_ map.
+   */
+  inline int64_t GetIndexFromId(int64_t key) {
+    auto iter = id_to_index_.find(key);
+    if (iter == id_to_index_.end()) {
+      return -1;
+    } else {
+      return iter->second;
+    }
+  }
+  void SyncIndex();
+  /*
+   * @brief Get complete Dims before
+   */
+  DDim GetCompleteDims() const {
+    DDim dims = value_->dims();
+    dims[0] = height_;
+    return dims;
+  }
+ private:
+  // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here.
+  // SelectedRows are simply concated when adding together. Until a
+  // SelectedRows add a Tensor, will the duplicate rows be handled.
+  std::vector<int64_t> rows_;
+  std::unordered_map<int64_t, int64_t>
+      id_to_index_;  // should not be used when rows_ has duplicate member
+  std::unique_ptr<Tensor> value_{nullptr};
+  int64_t height_;  // height indicates the underline tensor's height
+  std::unique_ptr<RWLock> rwlock_{nullptr};
+};
+/*
+ * Serialize/Desiralize SelectedRows to std::ostream
+ * You can pass ofstream or ostringstream to serilize to file
+ * or to a in memory string. GPU tensor will be copied to CPU.
+ */
+void SerializeToStream(std::ostream& os,
+                       const SelectedRows& selected_rows,
+                       const lite::Context<lite::TargetType::kX86>& dev_ctx);
+void DeserializeFromStream(
+    std::istream& is,
+    SelectedRows* selected_rows,
+    const lite::Context<lite::TargetType::kX86>& dev_ctx);
+}  // namespace fluid
+}  // namespace lite
+}  // namespace paddle
--- a/lite/model_parser/model_parser.h
+++ b/lite/model_parser/model_parser.h
@@ -72,7 +72,7 @@ void SerializeTensor(std::ostream& os,
 // LoDTensor to ostream
 void TensorToStream(std::ostream& os, const lite::Tensor& tensor);
+void TensorFromStream(std::istream& is, lite::Tensor* tensor);
 void ReadBinaryFile(const std::string& filename, std::string* contents);
 // For naive buffer