diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 812612580cf53f12d8c4c267fd0060ee6034d0aa..16f0a4c6ffa39bcd0ea1a407c6d9b06ebfecdf60 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -36,14 +36,15 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
   auto dst_ptr = dst->mutable_data(dst_place, src.type());
 
+  if (src_ptr == dst_ptr && src_place == dst_place) {
+    VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+            << dst_place;
+    return;
+  }
+
   auto size = src.numel() * SizeOfType(src.type());
 
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
-    if (src_ptr == dst_ptr) {
-      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
-              << dst_place;
-      return;
-    }
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                  boost::get<platform::CPUPlace>(src_place), src_ptr, size);
   }
 
@@ -79,11 +80,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     if (platform::is_same_place(src_place, dst_place)) {
-      if (src_ptr == dst_ptr) {
-        VLOG(3) << "Skip copy the same data async from " << src_place << " to "
-                << dst_place;
-        return;
-      }
       memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
                    stream);
     } else {
@@ -127,13 +123,15 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   auto src_place = src.place();
   auto src_ptr = src.data<void>();
   auto dst_ptr = dst->mutable_data(dst_place, src.type());
+
+  if (src_ptr == dst_ptr && src_place == dst_place) {
+    VLOG(3) << "Skip copy the same data from " << src_place << " to "
+            << dst_place;
+    return;
+  }
+
   auto size = src.numel() * SizeOfType(src.type());
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
-    if (src_ptr == dst_ptr) {
-      VLOG(3) << "Skip copy the same data from " << src_place << " to "
-              << dst_place;
-      return;
-    }
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                  boost::get<platform::CPUPlace>(src_place), src_ptr, size);
   }
@@ -153,11 +151,6 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   } else if (platform::is_gpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
     platform::RecordEvent record_event("TensorCopy:GPU->GPU");
-    if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
-      VLOG(3) << "Skip copy the same data from " << src_place << " to "
-              << dst_place;
-      return;
-    }
     auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr);
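The guard that used to be duplicated in the CPU-to-CPU and same-place GPU branches is now checked once, up front, so every path through TensorCopy and TensorCopySync skips the degenerate self-copy. Note the extra `src_place == dst_place` condition: pointer equality alone does not prove a true self-copy, since the skip is only sound when both tensors also live in the same place. A standalone sketch of the rule (plain C++, not Paddle source; `Place` is a stand-in for `platform::Place`):

```cpp
#include <cassert>
#include <cstring>

// Stand-in for platform::Place: just a device id here.
struct Place {
  int device;
  bool operator==(const Place& o) const { return device == o.device; }
};

// Shape of the hoisted fast path: skip only when both the buffer and the
// placement match; otherwise fall through to a real copy.
void CopyBytes(const void* src, Place src_place, void* dst, Place dst_place,
               size_t n) {
  if (src == dst && src_place == dst_place) return;  // the new early return
  std::memcpy(dst, src, n);                          // stands in for memory::Copy
}

int main() {
  float buf[4] = {1, 2, 3, 4};
  CopyBytes(buf, {0}, buf, {0}, sizeof(buf));  // self-copy: skipped
  float dst[4];
  CopyBytes(buf, {0}, dst, {0}, sizeof(buf));  // distinct buffers: copied
  assert(dst[2] == 3);
  return 0;
}
```

diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc
index 94483bc7f4cd87316c20b3280520dfb633e2bf86..c7cd230a450a9d078024c2f9a29947fc459fd9c0 100644
--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
@@ -205,6 +205,11 @@ class LoDResetGradMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
+DECLARE_INPLACE_OP_INFERER(LodResetInplaceInferer, {"X", "Out"});
+DECLARE_INPLACE_OP_INFERER(LodResetGradInplaceInferer,
+                           {framework::GradVarName("Out"),
+                            framework::GradVarName("X")});
+
 DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LoDResetGradNoNeedBufferVarInference,
                                       "X");
 
@@ -215,9 +220,10 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker,
                   ops::LoDResetGradMaker<paddle::framework::OpDesc>,
                   ops::LoDResetGradMaker<paddle::imperative::OpBase>,
-                  ops::LoDResetOpVarTypeInference);
+                  ops::LoDResetOpVarTypeInference, ops::LodResetInplaceInferer);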
 REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp,
-                  ops::LoDResetGradNoNeedBufferVarInference);
+                  ops::LoDResetGradNoNeedBufferVarInference,
+                  ops::LodResetGradInplaceInferer);
 REGISTER_OP_CPU_KERNEL(
     lod_reset, ops::LoDResetKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h
index 7677fa2251f0b2fefe84f5baa0eb2763fbae21c1..87e8c31a9d0ae567de31e7a120d48ec8cadf755c 100644
--- a/paddle/fluid/operators/lod_reset_op.h
+++ b/paddle/fluid/operators/lod_reset_op.h
@@ -31,7 +31,7 @@ class LoDResetKernel : public framework::OpKernel<T> {
     auto* lod_t = ctx.Input<framework::LoDTensor>("Y");
     bool append = ctx.Attr<bool>("append");
 
-    out->ShareDataWith(*in);
+    framework::TensorCopy(*in, in->place(), out);
 
     std::vector<int> level0;
     if (lod_t) {
@@ -45,8 +45,8 @@ class LoDResetKernel : public framework::OpKernel<T> {
       return;  // early return, since lod already set
     } else {
       auto* lod = lod_t->data<int>();
+      framework::Tensor lod_cpu;
       if (platform::is_gpu_place(lod_t->place())) {
-        framework::Tensor lod_cpu;
         framework::TensorCopySync(*lod_t, platform::CPUPlace(), &lod_cpu);
         lod = lod_cpu.data<int>();
       }
@@ -90,7 +90,7 @@ class LoDResetGradKernel : public framework::OpKernel<T> {
     auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
 
-    d_x->ShareDataWith(*d_out);
+    framework::TensorCopy(*d_out, d_out->place(), d_x);
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h
index 01d66901afc49a487c344b039b65f547967e95ff..4718ae915a66372df522cd24c3b9ef936a7b63c7 100644
--- a/paddle/fluid/operators/pad_constant_like_op.h
+++ b/paddle/fluid/operators/pad_constant_like_op.h
@@ -34,8 +34,7 @@ class PadConstantLikeKernel : public framework::OpKernel<T> {
     auto* out = context.Output<framework::Tensor>("Out");
 
     if (in_x->dims() == in_y->dims()) {
-      // TensorCopy(in_y, context.GetPlace(), context, out);
-      out->ShareDataWith(*in_y);
+      framework::TensorCopy(*in_y, context.GetPlace(), out);
       return;
     }
 
@@ -70,8 +69,7 @@ class PadConstantLikeGradKernel : public framework::OpKernel<T> {
     }
 
     if (in_dout->dims() == in_y->dims()) {
-      // TensorCopy(in_dout, context.GetPlace(), context, d_y);
-      d_y->ShareDataWith(*in_dout);
+      framework::TensorCopy(*in_dout, context.GetPlace(), d_y);
       return;
     }
 
diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu
index fb49793b730f72d66dc846f233bd95ebdab37c52..4bcd27036a53017e437e4819d00d886172040adb 100644
--- a/paddle/fluid/operators/sample_logits_op.cu
+++ b/paddle/fluid/operators/sample_logits_op.cu
@@ -155,8 +155,15 @@ class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
           context.Input<Tensor>("CustomizedSamples");
       const Tensor* customized_probabilities =
           context.Input<Tensor>("CustomizedProbabilities");
-      samples->ShareDataWith(*customized_samples);
-      probabilities->ShareDataWith(*customized_probabilities);
+      PADDLE_ENFORCE_EQ(customized_samples, samples,
+                        platform::errors::InvalidArgument(
+                            "CustomizedSamples must be the same Tensor with "
+                            "Samples when use_customized_samples = True"));
+      PADDLE_ENFORCE_EQ(
+          customized_probabilities, probabilities,
+          platform::errors::InvalidArgument(
+              "CustomizedProbabilities must be the same Tensor with "
+              "Probabilities when use_customized_samples = True"));
     } else {
       samples->mutable_data<int64_t>(context.GetPlace());
       probabilities->mutable_data<T>(samples_dim, context.GetPlace());
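Two things happen in the lod_reset and pad_constant_like kernels above. First, ShareDataWith, which silently aliases the input's buffer into the output, is replaced by a genuine TensorCopy; combined with the new inplace inferers and the hoisted early return, the copy still costs nothing when the executor actually binds Out to X's buffer, but no longer aliases when it doesn't. Second, `lod_cpu` is hoisted out of the `is_gpu_place` block: in the old code `lod` was left pointing into a Tensor that died at the closing brace, so the reads that follow the block were a use-after-free on the GPU path. A standalone sketch of the corrected lifetime (plain C++, names hypothetical):

```cpp
#include <cstdio>
#include <vector>

// ReadLevels models the fix in LoDResetKernel: the host buffer that `lod`
// may point into is declared in the enclosing scope, so it outlives the
// branch and the read at the end stays valid on both paths.
std::vector<int> ReadLevels(const int* device_data, int n, bool on_gpu) {
  std::vector<int> host_copy;  // hoisted, like lod_cpu in the diff
  const int* lod = device_data;
  if (on_gpu) {
    // stands in for TensorCopySync(*lod_t, platform::CPUPlace(), &lod_cpu)
    host_copy.assign(device_data, device_data + n);
    lod = host_copy.data();  // still valid after the if-block
  }
  return std::vector<int>(lod, lod + n);
}

int main() {
  int raw[] = {0, 2, 5};
  for (int v : ReadLevels(raw, 3, true)) std::printf("%d ", v);  // 0 2 5
  std::printf("\n");
  return 0;
}
```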
diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h
index 18ef6c9d3fe62a913413cf8c84e23b7c6accfc5c..5c0d3a677db679f1f70186ce5b57ca7c79263ab9 100644
--- a/paddle/fluid/operators/sample_logits_op.h
+++ b/paddle/fluid/operators/sample_logits_op.h
@@ -195,8 +195,15 @@ class SampleLogitsKernel : public framework::OpKernel<T> {
           context.Input<Tensor>("CustomizedSamples");
       const Tensor* customized_probabilities =
           context.Input<Tensor>("CustomizedProbabilities");
-      samples->ShareDataWith(*customized_samples);
-      probabilities->ShareDataWith(*customized_probabilities);
+      PADDLE_ENFORCE_EQ(customized_samples, samples,
+                        platform::errors::InvalidArgument(
+                            "CustomizedSamples must be the same Tensor with "
+                            "Samples when use_customized_samples = True"));
+      PADDLE_ENFORCE_EQ(
+          customized_probabilities, probabilities,
+          platform::errors::InvalidArgument(
+              "CustomizedProbabilities must be the same Tensor with "
+              "Probabilities when use_customized_samples = True"));
     } else {
       samples->mutable_data<int64_t>(context.GetPlace());
       probabilities->mutable_data<T>(samples_dim, context.GetPlace());
diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc
index d763f74d4ee6e66b8ec6de5d3e0c11679c664687..6ba5ba7bb277c21384c12b60f9f573e4afc31513 100644
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
@@ -130,14 +130,21 @@ class ScatterGradMaker : public framework::SingleGradOpMaker<T> {
 DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ScatterGradNoNeedBufferVarsInference,
                                       "Updates");
 
+DECLARE_INPLACE_OP_INFERER(ScatterInplaceInferer, {"X", "Out"});
+DECLARE_INPLACE_OP_INFERER(ScatterGradInplaceInferer,
+                           {framework::GradVarName("Out"),
+                            framework::GradVarName("X")});
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker,
                   ops::ScatterGradMaker<paddle::framework::OpDesc>,
-                  ops::ScatterGradMaker<paddle::imperative::OpBase>);
+                  ops::ScatterGradMaker<paddle::imperative::OpBase>,
+                  ops::ScatterInplaceInferer);
 REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp,
-                  ops::ScatterGradNoNeedBufferVarsInference);
+                  ops::ScatterGradNoNeedBufferVarsInference,
+                  ops::ScatterGradInplaceInferer);
 REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel<float>);
 REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel<float>);
diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu
index 6c4da760ce828e49b55c5d488958e1039fe62702..f3e0faa164c2f95bcb2402a7ba92847cfb67167c 100644
--- a/paddle/fluid/operators/scatter_op.cu
+++ b/paddle/fluid/operators/scatter_op.cu
@@ -32,7 +32,7 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
     auto *Out = ctx.Output<Tensor>("Out");
     bool overwrite = ctx.Attr<bool>("overwrite");
 
-    Out->ShareDataWith(*X);
+    framework::TensorCopy(*X, ctx.GetPlace(), Out);
     // use template class to support int32_t and int64_t
     const auto &index_type = Ids->type();
     bool index_type_match = index_type == framework::proto::VarType::INT32 ||
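ScatterInplaceInferer tells the framework that scatter's Out may reuse X's Variable, and the grad inferer pairs GradVarName("Out") with GradVarName("X") the same way. The kernels therefore keep an unconditional copy-then-update structure; when the inplace pass does bind a pair to one buffer, the copy hits the new TensorCopy early return and becomes free. A standalone model of the overwrite-mode computation the kernel performs (plain C++, not Paddle source):

```cpp
#include <cstdio>
#include <vector>

// Scatter forward in overwrite mode: Out = X, then Out[Ids[i]] = Updates[i].
std::vector<float> Scatter(const std::vector<float>& x,
                           const std::vector<int>& ids,
                           const std::vector<float>& updates) {
  std::vector<float> out = x;  // the TensorCopy step; a no-op under inplace
  for (size_t i = 0; i < ids.size(); ++i) out[ids[i]] = updates[i];
  return out;
}

int main() {
  std::vector<float> out = Scatter({1, 2, 3, 4}, {3, 1}, {9.5f, 7.5f});
  for (float v : out) std::printf("%g ", v);  // 1 7.5 3 9.5
  std::printf("\n");
  return 0;
}
```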
diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h
index 97254f817d9856aca9ffe1a101551b902541d9cf..b4043b6ebbf8ba2ac77ecbbd3427db13bdf99397 100644
--- a/paddle/fluid/operators/scatter_op.h
+++ b/paddle/fluid/operators/scatter_op.h
@@ -36,7 +36,7 @@ class ScatterOpKernel : public framework::OpKernel<T> {
     double overwrite = ctx.Attr<bool>("overwrite");
 
     // In place output: Out = X, Out[Ids] = Updates
-    framework::TensorCopySync(*X, ctx.GetPlace(), Out);
+    framework::TensorCopy(*X, ctx.GetPlace(), Out);
     // Apply ScatterUpdate: Out[index] = Updates[:]
     const auto &index_type = Ids->type();
     bool index_type_match = index_type == framework::proto::VarType::INT32 ||
@@ -76,7 +76,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
     if (dX) {
       // In place gradient: dX = dO
-      framework::TensorCopySync(*dOut, ctx.GetPlace(), dX);
+      framework::TensorCopy(*dOut, ctx.GetPlace(), dX);
     }
     if (dUpdates) {
       dUpdates->mutable_data<T>(ctx.GetPlace());
diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py
index 2b3833b12a65b61b794cc80973d4b32cfbc1d76d..e5eaa270afec60bd62047e5c607d5e5a1c67cbf2 100644
--- a/python/paddle/fluid/layers/loss.py
+++ b/python/paddle/fluid/layers/loss.py
@@ -1060,8 +1060,9 @@ def sampled_softmax_with_cross_entropy(logits,
             logits=fc, label=label, num_samples=25)
     """
     helper = LayerHelper('sample_logits', **locals())
-    samples = helper.create_variable_for_type_inference(dtype='int64')
-    probabilities = helper.create_variable_for_type_inference(
+    samples = customized_samples if use_customized_samples else helper.create_variable_for_type_inference(
+        dtype='int64')
+    probabilities = customized_probabilities if use_customized_samples else helper.create_variable_for_type_inference(
         dtype=logits.dtype)
     sampled_logits \
         = helper.create_variable_for_type_inference(dtype=logits.dtype)
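The same pattern covers the backward pass: switching TensorCopySync to TensorCopy keeps dX = dOut as a plain pass-through that the inplace binding can elide (per the kernel's own comment, dX = dO; dUpdates then gathers the scattered rows). The loss.py change closes the loop for sample_logits: when use_customized_samples is set, the layer now passes customized_samples and customized_probabilities through as the op's output variables instead of creating fresh ones, which is exactly the identity the new PADDLE_ENFORCE_EQ checks assert at kernel time. A standalone model of the backward computation (plain C++, not Paddle source):

```cpp
#include <cstdio>
#include <vector>

// Scatter backward, mirroring the kernel: dX is a pass-through copy of
// dOut (skipped entirely when the grad inplace pair shares one buffer),
// and dUpdates gathers dOut at the scattered indices.
void ScatterGrad(const std::vector<float>& d_out, const std::vector<int>& ids,
                 std::vector<float>* d_x, std::vector<float>* d_updates) {
  *d_x = d_out;  // the TensorCopy step
  d_updates->clear();
  for (int id : ids) d_updates->push_back(d_out[id]);  // the gather step
}

int main() {
  std::vector<float> d_x, d_updates;
  ScatterGrad({0.1f, 0.2f, 0.3f, 0.4f}, {3, 1}, &d_x, &d_updates);
  for (float v : d_updates) std::printf("%g ", v);  // 0.4 0.2
  std::printf("\n");
  return 0;
}
```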