From 9ec1432df691ee6734d3a0c61d81d7da1b2b4e0c Mon Sep 17 00:00:00 2001 From: Feiyu Chan Date: Mon, 22 Nov 2021 20:04:46 +0800 Subject: [PATCH] disable copying of datatype when sharing buffer between two tensors. (#37247) * disable copying of datatype when sharing buffer between two tensors. * fix for mkldnn operator kernels (elementwise_add, sum, softplus, softmax, scale, activation), manually set the data type when reusing memory by ShareBufferWith. --- paddle/fluid/framework/tensor.h | 5 ++++- .../mkldnn/elementwise_mkldnn_op.h | 19 +++++++++++++++--- .../operators/mkldnn/activation_mkldnn_op.cc | 8 +++++++- .../fluid/operators/mkldnn/scale_mkldnn_op.cc | 9 +++++++-- .../operators/mkldnn/softmax_mkldnn_op.cc | 10 +++++++--- .../operators/mkldnn/softplus_mkldnn_op.h | 20 ++++++++++++------- .../fluid/operators/mkldnn/sum_mkldnn_op.cc | 9 +++++++-- 7 files changed, 61 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index e889de8552d..ec990213433 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -254,7 +254,10 @@ class Tensor { void ShareBufferWith(const Tensor& tensor) { holder_ = tensor.holder_; offset_ = tensor.offset_; - type_ = tensor.type_; + // NOTE(chenfeiyu): when sharing buffer, by definition only the holder + // of the memory allocation and the offset should be shared. Shape, + // data type, layout, and other metadata associated with a Tensor + // should not be copied. 
} bool IsSharedBufferWith(const Tensor& src) const { diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index 131ea3901da..1439cccc3bf 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -62,9 +62,22 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { // and they share a buffer (of // shape x) then this buffer is not big enough to hold result of elementwise // operation. - auto dst_memory = (x->numel() == z->numel() && x->IsSharedBufferWith(*z)) - ? src_x_memory - : handler.AcquireDstMemory(z); + const bool reuse_x_memory = + x->numel() == z->numel() && x->IsSharedBufferWith(*z); + std::shared_ptr<dnnl::memory> dst_memory = nullptr; + if (reuse_x_memory) { + dst_memory = src_x_memory; + // NOTE(chenfeiyu): when the output reuses memory from another tensor rather + // than allocating its own, it still needs to take care of its data type. + // Unfortunately, paddle's operator only infers the output's shape, but not + // the data type. mutable_data takes care of allocation and data type + // normally, but if the memory is already allocated and there is no need + // to re-allocate, it just sets the data type. So it is added here to + // get the right data type. + z->mutable_data<T>(ctx.GetPlace()); + } else { + dst_memory = handler.AcquireDstMemory(z); + } const auto binary_prim = handler.AcquireForwardPrimitive(); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 29e18da590e..4bde641d2c1 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -91,7 +91,13 @@ void eltwise_forward(const framework::ExecutionContext &ctx, ctx.GetPlace(), x); auto src_memory_p = handler.AcquireSrcMemory(x); - auto dst_memory_p = is_inplaced ? 
src_memory_p : handler.AcquireDstMemory(y); + std::shared_ptr<dnnl::memory> dst_memory_p = nullptr; + if (is_inplaced) { + dst_memory_p = src_memory_p; + y->mutable_data<T>(ctx.GetPlace()); + } else { + dst_memory_p = handler.AcquireDstMemory(y); + } auto activation_p = handler.AcquireForwardPrimitive(); auto &astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); diff --git a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc index 84ac14d04b8..b8b735e96d2 100644 --- a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc @@ -41,8 +41,13 @@ class ScaleMKLDNNKernel : public framework::OpKernel { x); auto src_memory_p = handler.AcquireSrcMemory(x); - auto dst_memory_p = - is_inplaced ? src_memory_p : handler.AcquireDstMemory(out); + std::shared_ptr<dnnl::memory> dst_memory_p = nullptr; + if (is_inplaced) { + dst_memory_p = src_memory_p; + out->mutable_data<T>(ctx.GetPlace()); + } else { + dst_memory_p = handler.AcquireDstMemory(out); + } auto activation_p = handler.AcquireForwardPrimitive(); auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index b0f27719bf9..c26c017596d 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -103,9 +103,13 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { auto softmax_src_memory_p = handler.AcquireSrcMemory(input); // For Inplace src and and dst are the same memory object - auto softmax_dst_memory_p = - is_inplaced ? 
softmax_src_memory_p : handler.AcquireDstMemory(output); - + std::shared_ptr<dnnl::memory> softmax_dst_memory_p = nullptr; + if (is_inplaced) { + softmax_dst_memory_p = softmax_src_memory_p; + output->mutable_data<T>(ctx.GetPlace()); + } else { + softmax_dst_memory_p = handler.AcquireDstMemory(output); + } auto softmax_p = handler.AcquireForwardPrimitive(); auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); diff --git a/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h b/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h index 60ea5136905..c8c539a9565 100644 --- a/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h +++ b/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once #include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { @@ -43,7 +44,7 @@ class SoftplusMKLDNNHandler 1.0f / beta, 0.0f); } - AppendFusedActivationIfExists(ctx, post_ops); + AppendFusedActivationIfExists(ctx, &post_ops); dnnl::primitive_attr attrs; attrs.set_post_ops(post_ops); @@ -59,16 +60,16 @@ class SoftplusMKLDNNHandler private: void AppendFusedActivationIfExists(const framework::ExecutionContext& ctx, - dnnl::post_ops& post_ops) { + dnnl::post_ops* post_ops) { const auto& fused_activation_type = algo_map.find(ctx.Attr("fuse_activation_type")); if (fused_activation_type != algo_map.end()) { auto scale_out = ctx.Attr("fuse_activation_scale"); // for future int8 support - post_ops.append_eltwise(scale_out, fused_activation_type->second, - ctx.Attr("fuse_activation_alpha"), - ctx.Attr("fuse_activation_beta")); + post_ops->append_eltwise(scale_out, fused_activation_type->second, + ctx.Attr<float>("fuse_activation_alpha"), + ctx.Attr<float>("fuse_activation_beta")); } } @@ -109,8 +110,13 @@ void custom_softplus_eltwise_forward(const framework::ExecutionContext& ctx) { auto src_memory_p = 
handler.AcquireSrcMemory(x); auto beta_memory_p = handler.AcquireBetaMemory(&beta); - auto dst_memory_p = - is_inplaced ? src_memory_p : handler.AcquireDstMemory(out); + std::shared_ptr<dnnl::memory> dst_memory_p = nullptr; + if (is_inplaced) { + dst_memory_p = src_memory_p; + out->mutable_data<T>(ctx.GetPlace()); + } else { + dst_memory_p = handler.AcquireDstMemory(out); + } auto binary_p = handler.AcquireForwardPrimitive(); auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 8208a484b4a..2760bcecd5b 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -137,8 +137,13 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { ++input_index; } - auto dst_mem = in_place ? handler.AcquireDstMemory() - : handler.AcquireDstMemory(output); + std::shared_ptr<dnnl::memory> dst_mem = nullptr; + if (in_place) { + dst_mem = handler.AcquireDstMemory(); + output->mutable_data<T>(ctx.GetPlace()); + } else { + dst_mem = handler.AcquireDstMemory(output); + } auto sum_p = handler.AcquireForwardPrimitive(); -- GitLab