From 39f41cb47f2bbbc55bc4c65182a4607a50e2ef37 Mon Sep 17 00:00:00 2001
From: liym27 <33742067+liym27@users.noreply.github.com>
Date: Fri, 5 Feb 2021 19:22:41 +0800
Subject: [PATCH] Performance optimization for dynamic setitem: Call op
 set_value to speed up, because the original call to TensorToPyArray
 introduces an unnecessary data copy. (#30817)

---
 paddle/fluid/pybind/imperative.cc | 82 ++++++++++++++++++++++++++-----
 1 file changed, 69 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 4d2a7b6a4de..6d20c867570 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -583,26 +583,82 @@ void BindImperative(py::module *m_ptr) {
               py::object &value_obj) {
             auto self_tensor =
                 self->MutableVar()->GetMutable<framework::LoDTensor>();
-            auto self_numpy = TensorToPyArray(*self_tensor);
+            PyObject *index_ptr = !PyTuple_Check(_index.ptr())
+                                      ? PyTuple_Pack(1, _index.ptr())
+                                      : _index.ptr();
+            // 1. Check arguments
+            // 1.1 Check whether _index can be parsed.
+            bool parse_index = true;
+            const int size = PyTuple_GET_SIZE(index_ptr);
+            for (int dim = 0; dim < size; ++dim) {
+              PyObject *slice_item = PyTuple_GetItem(index_ptr, dim);
+              if (!(PyCheckInteger(slice_item) || PySlice_Check(slice_item))) {
+                parse_index = false;
+                break;
+              }
+            }
+
+            // 1.2 Check whether stride is 1.
+            std::vector<int> axes, starts, ends, strides, decrease_axis,
+                infer_flags;
+
+            bool stride_is_1 = true;
+            if (parse_index) {
+              ParseIndexingSlice(self_tensor, index_ptr, &axes, &starts, &ends,
+                                 &strides, &decrease_axis, &infer_flags);
+              stride_is_1 =
+                  std::all_of(strides.cbegin(), strides.cend(),
+                              [](int64_t stride) { return stride == 1; });
+            }
+            // 1.3 Check whether value obj is a tensor.
+            bool value_is_tensor = true;
 
             if (py::isinstance<py::array>(value_obj) ||
                 py::isinstance<py::int_>(value_obj) ||
                 py::isinstance<py::float_>(value_obj)) {
-              auto value_numpy = value_obj;
-              self_numpy[_index] = value_numpy;
-              SetTensorFromPyArray(self_tensor, self_numpy,
-                                   self_tensor->place(), true);
+              value_is_tensor = false;
+            }
+
+            // 2. Call op set_value to speed up if the condition is met,
+            // otherwise call TensorToPyArray.
+            // TODO(liym27): Try not to call TensorToPyArray because it always
+            // copies data to cpu place, which reduces performance.
+            if (parse_index && stride_is_1 && value_is_tensor) {
+              framework::AttributeMap attrs = {
+                  {"axes", axes}, {"starts", starts}, {"ends", ends}};
+
+              imperative::NameVarBaseMap ins = {{"Input", {self}}};
+              imperative::NameVarBaseMap outs = {{"Out", {self}}};
 
-            } else {
-              auto value =
-                  value_obj.cast<std::shared_ptr<imperative::VarBase>>();
               auto value_tensor =
-                  value->MutableVar()->GetMutable<framework::LoDTensor>();
-              auto value_numpy = TensorToPyArray(*value_tensor);
+                  value_obj.cast<std::shared_ptr<imperative::VarBase>>();
+              ins.insert({"ValueTensor", {value_tensor}});
 
-              self_numpy[_index] = value_numpy;
-              SetTensorFromPyArray(self_tensor, self_numpy,
-                                   self_tensor->place(), true);
+              const auto &tracer = imperative::GetCurrentTracer();
+              {
+                // Release gil and do tracing
+                py::gil_scoped_release release;
+                tracer->TraceOp("set_value", ins, outs, std::move(attrs));
+              }
+            } else {
+              auto self_numpy = TensorToPyArray(*self_tensor);
+
+              if (value_is_tensor) {
+                auto value =
+                    value_obj.cast<std::shared_ptr<imperative::VarBase>>();
+                auto value_tensor =
+                    value->MutableVar()->GetMutable<framework::LoDTensor>();
+                auto value_numpy = TensorToPyArray(*value_tensor);
+
+                self_numpy[_index] = value_numpy;
+                SetTensorFromPyArray(self_tensor, self_numpy,
+                                     self_tensor->place(), true);
+              } else {
+                auto value_numpy = value_obj;
+                self_numpy[_index] = value_numpy;
+                SetTensorFromPyArray(self_tensor, self_numpy,
+                                     self_tensor->place(), true);
+              }
             }
             // NOTE(liym27):
             // Increase the version of VarBase self because __setitem__ is an
--
GitLab
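
Note (not part of the patch): the new fast path is gated by parse_index && stride_is_1 && value_is_tensor, i.e. the set_value op is traced only when every item of the index is an integer or a slice, every parsed slice has stride 1, and the assigned value is a VarBase; any other case still takes the TensorToPyArray round trip. The standalone C++ sketch below only illustrates that gate. The helper name CanUseSetValueFastPath and the example stride vectors are hypothetical, chosen for illustration, and are not Paddle APIs.

// Minimal standalone sketch of the fast-path gate described above.
// CanUseSetValueFastPath is a hypothetical helper, not Paddle code.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

bool CanUseSetValueFastPath(bool parse_index, const std::vector<int> &strides,
                            bool value_is_tensor) {
  // Mirrors the patch's check: every parsed slice must have stride 1.
  const bool stride_is_1 =
      std::all_of(strides.cbegin(), strides.cend(),
                  [](std::int64_t stride) { return stride == 1; });
  return parse_index && stride_is_1 && value_is_tensor;
}

int main() {
  // e.g. x[0:2, :] = some_tensor -> strides {1, 1}, tensor value: fast path
  std::cout << CanUseSetValueFastPath(true, {1, 1}, true) << "\n";  // 1
  // e.g. x[0:4:2] = some_tensor  -> stride 2: falls back to TensorToPyArray
  std::cout << CanUseSetValueFastPath(true, {2}, true) << "\n";     // 0
  // e.g. x[0:2] = 3.0            -> value is not a tensor: falls back
  std::cout << CanUseSetValueFastPath(true, {1}, false) << "\n";    // 0
  return 0;
}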