diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 1d7a2eb5b38255531880fe3d2e5321024caf0c6b..de77d189c8591c730741da0152359e0c279d3ae8 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -114,6 +114,11 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   auto dst_ptr = dst->mutable_data(dst_place, src.type());
   auto size = src.numel() * SizeOfType(src.type());
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    if (src_ptr == dst_ptr) {  // same host buffer: nothing to copy
+      VLOG(3) << "Skip copy the same data from " << src.place() << " to "
+              << dst_place;
+      return;  // leave before memory::Copy is issued on identical pointers
+    }
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                  boost::get<platform::CPUPlace>(src_place), src_ptr, size);
   }
@@ -132,6 +137,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
              platform::is_gpu_place(dst_place)) {
     auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
+    if (src_ptr == dst_ptr &&  // same address and same device id: skip
+        src_gpu_place.GetDeviceId() == dst_gpu_place.GetDeviceId()) {
+      VLOG(3) << "Skip copy the same data from " << src.place() << " to "
+              << dst_place;
+      return;  // leave before the device-to-device memory::Copy below
+    }
     memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
   }
 #endif
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index b8fdc3f826662a9ca7ec3b3bf2a4da7308a757ab..500d86fec33830fc2cfb0412f1f2c7780d08eb02 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -195,7 +195,6 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
   }
 };
 
-template <typename T>
 class ReshapeKernel {
  public:
   void operator()(const framework::ExecutionContext &ctx) const {
@@ -228,15 +227,12 @@ class ReshapeKernel {
           "sequence_reshape op.");
     }
 
-    if (in->data<T>() !=
-        reinterpret_cast<T *>(out->mutable_data(ctx.GetPlace(), in->type()))) {
-      framework::TensorCopySync(*in, ctx.GetPlace(), out);
-    }
+    out->mutable_data(ctx.GetPlace(), in->type());  // runtime dtype from input
+    framework::TensorCopySync(*in, ctx.GetPlace(), out);  // skips if aliased
     out->Resize(out_dims);
   }
 };
 
-template <typename T>
 class ReshapeGradKernel {
  public:
   void operator()(const framework::ExecutionContext &ctx) const {
@@ -244,9 +240,8 @@ class ReshapeGradKernel {
     auto *d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
     auto in_dims = d_x->dims();
 
-    if (d_out->data<T>() != d_x->mutable_data(ctx.GetPlace(), d_out->type())) {
-      framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
-    }
+    d_x->mutable_data(ctx.GetPlace(), d_out->type());  // dtype from d_out
+    framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);  // skips if aliased
     d_x->Resize(in_dims);
   }
 };
@@ -341,46 +336,38 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(reshape, ops::ReshapeOp, ops::ReshapeOpMaker,
                   paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel<float>,
-                               double, ops::ReshapeKernel<double>, int,
-                               ops::ReshapeKernel<int>, int64_t,
-                               ops::ReshapeKernel<int64_t>);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float,
-                               ops::ReshapeGradKernel<float>, double,
-                               ops::ReshapeGradKernel<double>, int,
-                               ops::ReshapeGradKernel<int>, int64_t,
-                               ops::ReshapeGradKernel<int64_t>);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
+                               ops::ReshapeKernel, int, ops::ReshapeKernel,
+                               int64_t, ops::ReshapeKernel);  // dtype-agnostic
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
+                               double, ops::ReshapeGradKernel, int,
+                               ops::ReshapeGradKernel, int64_t,
+                               ops::ReshapeGradKernel);  // dtype-agnostic
 
 REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker,
                   ops::Reshape2GradMaker);
 REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel<float>,
-                               double, ops::ReshapeKernel<double>, int,
-                               ops::ReshapeKernel<int>, int64_t,
-                               ops::ReshapeKernel<int64_t>);
-REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float,
-                               ops::ReshapeGradKernel<float>, double,
-                               ops::ReshapeGradKernel<double>, int,
-                               ops::ReshapeGradKernel<int>, int64_t,
-                               ops::ReshapeGradKernel<int64_t>);
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
+                               ops::ReshapeKernel, int, ops::ReshapeKernel,
+                               int64_t, ops::ReshapeKernel);  // dtype-agnostic
+REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
+                               double, ops::ReshapeGradKernel, int,
+                               ops::ReshapeGradKernel, int64_t,
+                               ops::ReshapeGradKernel);  // dtype-agnostic
 
 #ifdef PADDLE_WITH_CUDA
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel<float>,
-                                double, ops::ReshapeKernel<double>, int,
-                                ops::ReshapeKernel<int>, int64_t,
-                                ops::ReshapeKernel<int64_t>);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float,
-                                ops::ReshapeGradKernel<float>, double,
-                                ops::ReshapeGradKernel<double>, int,
-                                ops::ReshapeGradKernel<int>, int64_t,
-                                ops::ReshapeGradKernel<int64_t>);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel<float>,
-                                double, ops::ReshapeKernel<double>, int,
-                                ops::ReshapeKernel<int>, int64_t,
-                                ops::ReshapeKernel<int64_t>);
-REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float,
-                                ops::ReshapeGradKernel<float>, double,
-                                ops::ReshapeGradKernel<double>, int,
-                                ops::ReshapeGradKernel<int>, int64_t,
-                                ops::ReshapeGradKernel<int64_t>);
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double,
+                                ops::ReshapeKernel, int, ops::ReshapeKernel,
+                                int64_t, ops::ReshapeKernel);  // dtype-agnostic
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel,
+                                double, ops::ReshapeGradKernel, int,
+                                ops::ReshapeGradKernel, int64_t,
+                                ops::ReshapeGradKernel);  // dtype-agnostic
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
+                                ops::ReshapeKernel, int, ops::ReshapeKernel,
+                                int64_t, ops::ReshapeKernel);  // dtype-agnostic
+REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
+                                double, ops::ReshapeGradKernel, int,
+                                ops::ReshapeGradKernel, int64_t,
+                                ops::ReshapeGradKernel);  // dtype-agnostic
 #endif