diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index 35d251f71a0cb631d5900498ea3188b5ddeae334..17e576a9d5c8f50fbe84b066a93460f03ae6bb08 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -371,6 +371,8 @@ template struct RowwiseAdd<platform::CPUDeviceContext, double>;
 
 template struct ColwiseSum<platform::CPUDeviceContext, float>;
 template struct ColwiseSum<platform::CPUDeviceContext, double>;
+template struct ColwiseSum<platform::CPUDeviceContext, int>;
+template struct ColwiseSum<platform::CPUDeviceContext, int64_t>;
 
 template struct RowwiseSum<platform::CPUDeviceContext, float>;
 template struct RowwiseSum<platform::CPUDeviceContext, double>;
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index 3abbcdb71d03eaf6f8eba3d97150d27ac5a5405e..c6ca2693a053360ce5dc44765acf1520a11cce2c 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -422,6 +422,8 @@ struct RowwiseAdd<platform::CUDADeviceContext, T> {
 template struct RowwiseAdd<platform::CUDADeviceContext, float>;
 template struct RowwiseAdd<platform::CUDADeviceContext, double>;
 template struct ColwiseSum<platform::CUDADeviceContext, float>;
+template struct ColwiseSum<platform::CUDADeviceContext, int>;
+template struct ColwiseSum<platform::CUDADeviceContext, int64_t>;
 // template struct ColwiseSum<platform::CUDADeviceContext, double>;
 // The ColwiseSum<platform::CUDADeviceContext, double> failed in debug mode,
 // and only failed for this case. So reimplemented it.
diff --git a/paddle/fluid/operators/sequence_expand_op.cc b/paddle/fluid/operators/sequence_expand_op.cc
index acb6eb82a23a1cd122c4159022985eaf0d3fde7c..25a8283858bf5f867a083dd4f581386bfcf17076 100644
--- a/paddle/fluid/operators/sequence_expand_op.cc
+++ b/paddle/fluid/operators/sequence_expand_op.cc
@@ -33,9 +33,10 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
                    "Output(Out) of SequenceExpandOp should not be null.");
 
     auto x_dims = ctx->GetInputDim("X");
+    int ref_level = ctx->Attrs().Get<int>("ref_level");
+
     PADDLE_ENFORCE_EQ(x_dims.size(), 2U,
                       "Dimension number of Input(X) should be 2.");
-    int ref_level = ctx->Attrs().Get<int>("ref_level");
 
     if (ctx->IsRuntime()) {
       framework::Variable* x_var =
@@ -51,39 +52,37 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
                         "greater than 1.");
 
       PADDLE_ENFORCE(x_lod.size() == y_lod.size() || x_lod.size() == 0,
-                     "Number of lod level of Input(X) either equal to 0 "
-                     "or equal to that of Input(Y).");
+                     "Level number of Input(X)'s lod should be either equal "
+                     "to 0 or equal to that of Input(Y).");
+
+      PADDLE_ENFORCE_GT(y_lod.size(), 0,
+                        "Level number of Input(Y)'s lod should be "
+                        "greater than 0.");
+
+      PADDLE_ENFORCE(
+          ref_level == -1 ||
+              (ref_level >= 0 && ref_level < static_cast<int>(y_lod.size())),
+          "Invlid `ref_level`, which should be either equal to -1 "
+          "or in [0, %d)",
+          y_lod.size());
+
+      if (ref_level == -1) ref_level = y_lod.size() - 1;
 
       int64_t out_first_dim = 0;
-      if (y_lod[ref_level].size() < 1) {
+      if (y_lod[ref_level].size() <= 1) {
         out_first_dim = x_dims[0];
       } else {
-        if (x_lod.size() == 1) {  // X is LoDTensor
-          for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
-            int x_seq_len = x_lod[0][i] - x_lod[0][i - 1];
-            out_first_dim +=
-                (y_lod[ref_level][i] - y_lod[ref_level][i - 1]) * x_seq_len;
-          }
-        } else {  // X is normal Tensor
-          for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
-            out_first_dim += y_lod[ref_level][i] - y_lod[ref_level][i - 1];
+        for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
+          int x_seq_len = 1;
+          if (x_lod.size() == 1) {
+            x_seq_len = x_lod[0][i] - x_lod[0][i - 1];
           }
+          out_first_dim +=
+              (y_lod[ref_level][i] - y_lod[ref_level][i - 1]) * x_seq_len;
         }
       }
       ctx->SetOutputDim("Out", {out_first_dim, x_dims[1]});
     } else {
-      framework::VarDesc* in_reader =
-          boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("Y")[0]);
-      int lod_level_num = in_reader->GetLoDLevels().size();
-
-      PADDLE_ENFORCE_GE(ref_level, 0,
-                        "Level of referred lod should be greater or "
-                        "equal to 0.");
-
-      PADDLE_ENFORCE_LT(ref_level, lod_level_num,
-                        "Level of referred lod should be smaller than "
-                        "level number of Input(Y).");
-
       ctx->SetOutputDim("Out", {-1, x_dims[1]});
     }
   }
@@ -102,7 +101,7 @@ class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out",
               "(LodTensor, default LoDTensor<float>) Output LoDTensor which is "
               "generated from Input(X) by referring lod of Input(Y).");
-    AddAttr<int>("ref_level", "Specify lod level of Input(Y).");
+    AddAttr<int>("ref_level", "Specify lod level of Input(Y).").SetDefault(-1);
     AddComment(R"DOC(
 Sequence Expand Operator.
 
diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h
index 2b4fa016f73efc80aecf912e25504fb49bf67941..8cbfdf177e0a2bf762923bbd07de67ae99584be0 100644
--- a/paddle/fluid/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
-#include "unsupported/Eigen/CXX11/Tensor"
+#include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -32,52 +32,53 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
     auto* out = context.Output<LoDTensor>("Out");
     int ref_level = context.Attr<int>("ref_level");
 
+    out->mutable_data<T>(context.GetPlace());
     auto& x_lod = x->lod();
     auto& y_lod = y->lod();
 
-    PADDLE_ENFORCE_GE(ref_level, 0,
-                      "Value of attribute `ref_level` should be greater or "
-                      "equal to 0.");
+    PADDLE_ENFORCE_GT(y_lod.size(), 0,
+                      "Level number of `Y`'s lod should be greater than 0.");
 
-    PADDLE_ENFORCE_LT(ref_level, y_lod.size(),
-                      "Value of attribute `ref_level` should be smaller than "
-                      "level number of Y's lod.");
+    PADDLE_ENFORCE(
+        ref_level == -1 || (ref_level >= 0 && ref_level < y_lod.size()),
+        "Invlid `ref_level`, which should be either equal to -1 "
+        "or in [0, %d)",
+        y_lod.size());
 
-    if (y_lod[ref_level].size() < 1) {
+    if (ref_level == -1) ref_level = y_lod.size() - 1;
+
+    if (y_lod[ref_level].size() <= 1) {
       framework::TensorCopy(*x, context.GetPlace(), out);
       return;
     }
 
-    if (x_lod.size() == 0) {
-      int out_start = 0;
-      for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
-        int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
-        auto x_sub_tensor = x->Slice(i - 1, i);
-        for (size_t j = 0; j < repeat_num; ++j) {
-          auto out_sub_tensor = out->Slice(out_start, out_start + 1);
-          framework::TensorCopy(x_sub_tensor, context.GetPlace(),
-                                &out_sub_tensor);
-          out_start++;
-        }
-      }
-    } else {
-      auto& out_lod = *out->mutable_lod();
+    auto& out_lod = *out->mutable_lod();
+    if (x_lod.size() == 1) {
       out_lod.resize(1);
-      out_lod[0].resize(1);
-      out_lod[0][0] = 0;
-      int out_idx = 0;
-      for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
-        int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
-        int x_seq_len = x_lod[0][i] - x_lod[0][i - 1];
-        auto x_sub_tensor = x->Slice(x_lod[0][i], x_lod[0][i - 1]);
-        for (size_t j = 0; j < repeat_num; ++j) {
-          auto out_sub_tensor =
-              out->Slice(out_lod[0][out_idx], out_lod[0][out_idx] + x_seq_len);
-          framework::TensorCopy(x_sub_tensor, context.GetPlace(),
-                                &out_sub_tensor);
-          out_lod[0].push_back(out_lod[0][out_idx] + x_seq_len);
-          out_idx++;
+      out_lod[0] = {0};
+    }
+
+    int out_offset = 0;
+    for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
+      int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
+      int x_start = i - 1;
+      int x_end = i;
+      if (x_lod.size() == 1) {
+        x_start = x_lod[0][i - 1];
+        x_end = x_lod[0][i];
+      }
+      int x_seq_len = x_end - x_start;
+      auto x_sub_tensor = x->Slice(x_start, x_end);
+      for (size_t j = 0; j < repeat_num; ++j) {
+        int out_start = out_offset;
+        if (x_lod.size() == 1) {
+          out_start = out_lod[0][out_offset];
+          out_lod[0].push_back(x_seq_len);
         }
+        auto out_sub_tensor = out->Slice(out_start, out_start + x_seq_len);
+        framework::TensorCopy(x_sub_tensor, context.GetPlace(),
+                              &out_sub_tensor);
+        out_offset++;
       }
     }
   }
@@ -99,27 +100,49 @@ template <typename DeviceContext, typename T>
 class SequenceExpandGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* g_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
     auto* x = context.Input<LoDTensor>("X");
-    auto* out = context.Input<LoDTensor>("Out");
-    auto* d_x = context.Output<LoDTensor>(framework::GradVarName("X"));
-    auto out_last_level = out->lod().back();
-    d_x->set_lod(x->lod());
-    const T* d_out_data = d_out->data<T>();
-    T* d_x_data = d_x->mutable_data<T>(context.GetPlace());
-    size_t element_len = d_out->numel() / d_out->dims()[0];
-    for (size_t i = 0; i < out_last_level.size() - 1; ++i) {
-      size_t repeat = out_last_level[i + 1] - out_last_level[i];
-      Eigen::TensorMap<
-          Eigen::Tensor<const T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
-      d_out_t(d_out_data, static_cast<int>(repeat), element_len);
-      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>
-      d_x_t(d_x_data, static_cast<int>(element_len));
-      auto place =
-          context.template device_context<DeviceContext>().eigen_device();
-      d_x_t.device(*place) = d_out_t.sum(Eigen::array<int, 1>({{0}}));
-      d_out_data += (repeat * element_len);
-      d_x_data += element_len;
+    auto* y = context.Input<LoDTensor>("Y");
+    auto* g_x = context.Output<LoDTensor>(framework::GradVarName("X"));
+    int ref_level = context.Attr<int>("ref_level");
+
+    g_x->mutable_data<T>(context.GetPlace());
+    g_x->set_lod(x->lod());
+
+    auto& x_lod = x->lod();
+    auto& y_lod = y->lod();
+
+    if (ref_level == -1) ref_level = y_lod.size() - 1;
+
+    // just copy the gradient
+    if (y_lod[ref_level].size() <= 1) {
+      framework::TensorCopy(*g_out, context.GetPlace(), g_x);
+      return;
+    }
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
+    int g_out_offset = 0;
+    for (size_t i = 1; i < y_lod[ref_level].size(); ++i) {
+      int repeat_num = y_lod[ref_level][i] - y_lod[ref_level][i - 1];
+      if (repeat_num > 0) {
+        int x_start = i - 1;
+        int x_end = i;
+        if (x_lod.size() == 1) {
+          x_start = x_lod[0][i - 1];
+          x_end = x_lod[0][i];
+        }
+        int x_seq_len = x_end - x_start;
+        auto column = x_seq_len * x->dims()[1];
+        auto g_x_sub = g_x->Slice(x_start, x_end);
+        g_x_sub = framework::ReshapeToMatrix(g_x_sub, column);
+        int g_out_end = g_out_offset + repeat_num * x_seq_len;
+        auto g_out_sub = g_out->Slice(g_out_offset, g_out_end);
+        g_out_sub = framework::ReshapeToMatrix(g_out_sub, column);
+        math::ColwiseSum<DeviceContext, T> col_sum;
+        col_sum(dev_ctx, g_out_sub, &g_x_sub);
+        g_out_offset += repeat_num * x_seq_len;
+      }
     }
   }
 };