diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h
index a496301526f58875ff51aeaa5b2094c3c656531c..78be2e1e1f06c7a518e35a770c1dc9581b2d10fe 100644
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -62,9 +62,21 @@ class ConcatGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* out_grad =
         ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto ins = ctx.MultiInput<framework::LoDTensor>("X");
     auto out_var_names = ctx.Outputs(framework::GradVarName("X"));
-    auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
+    auto outs =
+        ctx.MultiOutput<framework::LoDTensor>(framework::GradVarName("X"));
+
+    {
+      auto dx = outs;
+      auto x = ins;
+      for (size_t i = 0; i < dx.size(); ++i) {
+        if (dx[i] != nullptr) {
+          dx[i]->set_lod(x[i]->lod());
+        }
+      }
+    }
+
     int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
 
     // get output tensor that the name is not kEmptyVarName
diff --git a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
index c86cd57316078778e5930c9b524b931d523028d7..9ad82aec8182d6ba06b67391d71317a3d0df1833 100644
--- a/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
+++ b/paddle/fluid/operators/elementwise_add_mkldnn_op.cc
@@ -137,9 +137,10 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
 };
 
 template <typename T>
-class EltwiseAddMKLDNNGradKernel : public framework::OpKernel<T> {
+class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
     using Tensor = framework::Tensor;
 
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h
index 5356105e2e551c0528694091608fc7585dce66d2..c60cb1f92e99329d52f6ed39dccde406a5f83563 100644
--- a/paddle/fluid/operators/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise_add_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/blas.h"
 
@@ -136,9 +137,11 @@ elementwise_add_grad(const framework::ExecutionContext& ctx,
 }
 
 template <typename DeviceContext, typename T>
-class ElementwiseAddGradKernel : public framework::OpKernel<T> {
+class ElementwiseAddGradKernel : public ElemwiseGradKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
+
     using Tensor = framework::Tensor;
 
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
diff --git a/paddle/fluid/operators/elementwise_div_op.h b/paddle/fluid/operators/elementwise_div_op.h
index 95649ac46e6bd41b9e1a865794cdec3ae1e6e247..41a7950bf0c598507c0fda48c6a43f2fd38c41d2 100644
--- a/paddle/fluid/operators/elementwise_div_op.h
+++ b/paddle/fluid/operators/elementwise_div_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/fluid/operators/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise_op_function.h"
-
 namespace paddle {
 namespace operators {
 
@@ -53,9 +53,10 @@ struct DivGradDY {
 };
 
 template <typename DeviceContext, typename T>
-class ElementwiseDivGradKernel : public framework::OpKernel<T> {
+class ElementwiseDivGradKernel : public ElemwiseGradKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
     using Tensor = framework::Tensor;
 
     auto* x = ctx.Input<Tensor>("X");
diff --git a/paddle/fluid/operators/elementwise_max_op.h b/paddle/fluid/operators/elementwise_max_op.h
index 527a18ee3ba88a158a13266a7fbcdafe59ec69d9..bfb5c931958b4ca890ea720af42dad91d5625abb 100644
--- a/paddle/fluid/operators/elementwise_max_op.h
+++ b/paddle/fluid/operators/elementwise_max_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/fluid/operators/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise_op_function.h"
 
 namespace paddle {
@@ -55,9 +56,10 @@ struct MaxGradDy {
 };
 
 template <typename DeviceContext, typename T>
-class ElementwiseMaxGradKernel : public framework::OpKernel<T> {
+class ElementwiseMaxGradKernel : public ElemwiseGradKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
     using Tensor = framework::Tensor;
 
     auto* x = ctx.Input<Tensor>("X");
diff --git a/paddle/fluid/operators/elementwise_min_op.h b/paddle/fluid/operators/elementwise_min_op.h
index d4e5831463f3e54c72789b6876ea696cf1b4ef4b..db035ffb52e619b337c8190af4ed0e155aaac48d 100644
--- a/paddle/fluid/operators/elementwise_min_op.h
+++ b/paddle/fluid/operators/elementwise_min_op.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/fluid/operators/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise_op_function.h"
-
 namespace paddle {
 namespace operators {
 
@@ -55,9 +55,10 @@ struct MinGradDy {
 };
 
 template <typename DeviceContext, typename T>
-class ElementwiseMinGradKernel : public framework::OpKernel<T> {
+class ElementwiseMinGradKernel : public ElemwiseGradKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
     using Tensor = framework::Tensor;
 
     auto* x = ctx.Input<Tensor>("X");
diff --git a/paddle/fluid/operators/elementwise_mul_op.h b/paddle/fluid/operators/elementwise_mul_op.h
index 329d2d129a9ea450cd211f0c6d2ea5e37ff8491d..4437da4d95f97b5cbbca1650badf9710c26b4380 100644
--- a/paddle/fluid/operators/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise_mul_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include "paddle/fluid/operators/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/blas.h"
 
@@ -84,9 +85,10 @@ struct MulGradDY {
 };
 
 template <typename DeviceContext, typename T>
-class ElementwiseMulGradKernel : public framework::OpKernel<T> {
+class ElementwiseMulGradKernel : public ElemwiseGradKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
     using Tensor = framework::Tensor;
 
     auto* x = ctx.Input<Tensor>("X");
diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h
index d8a12e800ad733800c1ec333f15d31d4dcd1a3a5..a79b900b9801e6b80e4433a9acdd4dab6c34859d 100644
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -205,6 +205,20 @@ class ElementwiseOpExplicitGrad : public ElementwiseOpGrad {
   }
 };
 
+template <typename T>
+class ElemwiseGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* dx =
+        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    if (dx != nullptr) {
+      auto& dout =
+          *context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+      dx->set_lod(dout.lod());
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
diff --git a/paddle/fluid/operators/elementwise_sub_op.h b/paddle/fluid/operators/elementwise_sub_op.h
index 11c7e3fe628001f095836a788f2bcc7c4ee7ad4b..3385df0897700d37d60d8804a01db777ebc02a7e 100644
--- a/paddle/fluid/operators/elementwise_sub_op.h
+++ b/paddle/fluid/operators/elementwise_sub_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include "paddle/fluid/operators/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise_op_function.h"
 
 namespace paddle {
@@ -50,9 +51,10 @@ struct SubGradDY {
 };
 
 template <typename DeviceContext, typename T>
-class ElementwiseSubGradKernel : public framework::OpKernel<T> {
+class ElementwiseSubGradKernel : public ElemwiseGradKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    ElemwiseGradKernel<T>::Compute(ctx);
     using Tensor = framework::Tensor;
 
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc
index 55c8a472aca7fe700ef6a3f96bed1496d7b12b80..fbe7c2978385401b35765101c87387ff727be4e0 100644
--- a/paddle/fluid/operators/math/concat.cc
+++ b/paddle/fluid/operators/math/concat.cc
@@ -71,7 +71,7 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input,
-                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const std::vector<const framework::LoDTensor*>& ref_inputs,
                   const int axis, std::vector<framework::Tensor*>* outputs) {
     // TODO(zcd): Add input data validity checking
     size_t num = outputs->size();
diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu
index 5863d74fca21de8b77bc208fb95d8fd52562f7a7..820e73e779720e4f76168e0a84a254ef645784ee 100644
--- a/paddle/fluid/operators/math/concat.cu
+++ b/paddle/fluid/operators/math/concat.cu
@@ -189,7 +189,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
-                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const std::vector<const framework::LoDTensor*>& ref_inputs,
                   const int axis, std::vector<framework::Tensor*>* outputs) {
     // TODO(zcd): Add input data validity checking
     int o_num = outputs->size();
diff --git a/paddle/fluid/operators/math/concat.h b/paddle/fluid/operators/math/concat.h
index 9e080f2e8be23768dcea47b577043beef37b2eaf..e5d7d860b371677b3cfc67a57390bdee0d0ecc37 100644
--- a/paddle/fluid/operators/math/concat.h
+++ b/paddle/fluid/operators/math/concat.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
 
 namespace paddle {
 namespace operators {
@@ -57,7 +57,7 @@ template <typename DeviceContext, typename T>
 class ConcatGradFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const std::vector<const framework::Tensor*>& ref_inputs,
+                  const std::vector<const framework::LoDTensor*>& ref_inputs,
                   const int axis, std::vector<framework::Tensor*>* outputs);
 };
 
diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h
index 15dd975e3bbf80b2e616e6628555e812d025f70a..f72824806ed6ee3a4490938403d441326f8a3d4a 100644
--- a/paddle/fluid/operators/mul_op.h
+++ b/paddle/fluid/operators/mul_op.h
@@ -62,23 +62,31 @@ class MulGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
     int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
-    const Tensor* x = ctx.Input<Tensor>("X");
-    const Tensor* y = ctx.Input<Tensor>("Y");
-    const Tensor x_matrix = x->dims().size() > 2
-                                ? framework::ReshapeToMatrix(*x, x_num_col_dims)
-                                : *x;
-    const Tensor y_matrix = y->dims().size() > 2
-                                ? framework::ReshapeToMatrix(*y, y_num_col_dims)
-                                : *y;
-    const Tensor* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* x = ctx.Input<framework::LoDTensor>("X");
+    auto* y = ctx.Input<framework::LoDTensor>("Y");
+    auto x_matrix = x->dims().size() > 2
+                        ? framework::ReshapeToMatrix(*x, x_num_col_dims)
+                        : static_cast<const Tensor&>(*x);
+    auto y_matrix = y->dims().size() > 2
+                        ? framework::ReshapeToMatrix(*y, y_num_col_dims)
+                        : static_cast<const Tensor&>(*y);
+    auto* dout = ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
 
     Tensor dout_mat;
     dout_mat.ShareDataWith(*dout);
     dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0],
                      framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]});
 
-    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    Tensor* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    auto* dx = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<framework::LoDTensor>(framework::GradVarName("Y"));
+
+    if (dx != nullptr) {
+      dx->set_lod(x->lod());
+    }
+    if (dy != nullptr) {
+      dy->set_lod(y->lod());
+    }
+
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
     if (dx) {
diff --git a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
index 0ddacb57106c090e8f4f9350a65a30ca102f8e0a..7aca9f7111956dba63e2ceee10077879fe092bdf 100644
--- a/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/sequence_softmax_cudnn_op.cu.cc
@@ -68,7 +68,9 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
     auto* out_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
     auto* x = ctx.Input<LoDTensor>("X");
     auto* x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
-
+    if (x_grad) {
+      x_grad->set_lod(x->lod());
+    }
     auto lod = x->lod();
     const size_t level = lod.size() - 1;
 
diff --git a/paddle/fluid/operators/sequence_softmax_op.h b/paddle/fluid/operators/sequence_softmax_op.h
index cb93a02b8386ed50ff176fc25b88449b7eb16902..bca564e16f9951519eefe25126aadebb4c1326b6 100644
--- a/paddle/fluid/operators/sequence_softmax_op.h
+++ b/paddle/fluid/operators/sequence_softmax_op.h
@@ -66,6 +66,9 @@ class SequenceSoftmaxGradKernel : public framework::OpKernel<T> {
     auto* out_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
     auto* x = ctx.Input<LoDTensor>("X");
     auto* x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+    if (x_grad) {
+      x_grad->set_lod(x->lod());
+    }
 
     auto lod = x->lod();
     const size_t level = lod.size() - 1;