Commit 2402029d authored by NazgulLee, committed by Jiaying Zhao

1. reverse commit 94e2ba9f; 2. reduce memory if input gets smaller (#1785)

Parent 57021c48
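In outline: SetInput() no longer reallocates activation memory eagerly when a smaller input arrives; it only records the fact. Predict() then re-runs shape inference for every op and asks the retained MemoryOptPass to shrink the reused buffers once. Below is a minimal sketch of that deferred flow with invented stand-in types; only the names Executor, MemoryOptPass, SetInput and Predict come from the diff itself.

#include <cstdint>
#include <memory>

// Stand-in for pass::MemoryOptPass: plans buffer reuse up front and can
// later shrink the planned buffers on demand.
struct PlannerSketch {
  void AdjustMemory() { /* realloc shared buffers to the new sizes */ }
};

// Condensed view of the Executor logic touched by this commit.
struct ExecutorSketch {
  bool lod_mode_ = true;
  bool input_dim_has_changed_ = false;
  bool should_adjust_memory_ = false;
  int64_t last_numel_ = 1;
  std::shared_ptr<PlannerSketch> memory_opt_ = std::make_shared<PlannerSketch>();

  void SetInput(int64_t numel) {
    // New behaviour: just record that the input shrank noticeably.
    should_adjust_memory_ = numel < 0.9 * last_numel_;
    input_dim_has_changed_ = numel != last_numel_;
    last_numel_ = numel;
  }

  void Predict() {
    if (lod_mode_ && input_dim_has_changed_) {
      // 1) InferShape() for every op would run here, then
      // 2) memory is adjusted at most once per shrink.
      if (memory_opt_ && should_adjust_memory_) {
        should_adjust_memory_ = false;
        memory_opt_->AdjustMemory();
      }
    }
    // ... op->Run() for every op ...
  }
};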
@@ -29,7 +29,6 @@ limitations under the License. */
 #include "framework/scope.h"
 #include "framework/tensor.h"
 #include "memory/t_malloc.h"
-#include "pass/memory_optimize.h"
 #include "pass/model_obfuscate.h"
 #ifdef PADDLE_MOBILE_CL
 #include "framework/cl/cl_image.h"
@@ -67,8 +66,9 @@ Executor<Device, T>::Executor(const Program<Device> &program,
 #if !defined(PADDLE_MOBILE_FPGA) && !defined(PADDLE_MOBILE_FPGA_KD) && \
     !defined(PADDLE_MOBILE_CL)
   if (config_.memory_optimization_level != NoMemoryOptimization) {
-    pass::MemoryOptPass()(program_desc_.get(), program_.scope.get(),
-                          config_.memory_optimization_level);
+    memoryOpt_ = std::make_shared<pass::MemoryOptPass>();
+    (*memoryOpt_)(program_desc_.get(), program_.scope.get(),
+                  config_.memory_optimization_level);
   }
 #endif
   // resize feed and fetch list
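Keeping the pass in a shared_ptr member rather than a temporary is what makes the later AdjustMemory() call possible; a throwaway `pass::MemoryOptPass()(...)` would discard the deputy list it just built. A tiny standalone contrast (PassLike and the method bodies are invented for illustration):

#include <memory>

struct PassLike {
  int planned_groups = 0;
  void operator()() { planned_groups = 42; }   // plan buffer reuse
  void AdjustMemory() { /* needs planned_groups from the earlier call */ }
};

void before_style() {
  PassLike()();  // temporary: the planned state is destroyed immediately
}

struct AfterStyle {
  std::shared_ptr<PassLike> memoryOpt_;
  void init() {
    memoryOpt_ = std::make_shared<PassLike>();
    (*memoryOpt_)();                             // same planning call...
  }
  void later() {
    if (memoryOpt_) memoryOpt_->AdjustMemory();  // ...but state still alive
  }
};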
@@ -299,26 +299,31 @@ void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
   for (const auto &block : program_desc_->Blocks()) {
     for (const auto &var_desc : block->Vars()) {
       auto var = program_.scope->Var(var_desc->Name());
-      if (!var_desc->Persistable() &&
-          var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
-        DLOG << "InitNoPersistableMemory var " << var_desc->Name();
-        auto tensor = var->template GetMutable<LoDTensor>();
-        if (tensor->IsInitialized()) {
-          DLOG << "var's tensor is Initialized";
+      auto tensor = var->template GetMutable<LoDTensor>();
+      if (var_desc->Persistable()) {
+        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
+          var->template GetMutable<framework::LoDTensorArray>();
+          continue;
+        }
+      } else {
+        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
           DDim tensor_dim = tensor->dims();
           DDim new_dim =
               make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                          input_tensor.dims()[3]});
           tensor->Resize(new_dim);
-          tensor->template mutable_data_new<T>();
-          DLOG << "var's tensor dims " << tensor_dim;
-          DLOG << "var's tensor new dims " << new_dim;
+          tensor->template mutable_data<T>();
         } else {
-          DLOG << "var's tensor is not Initialized ???";
+          PADDLE_MOBILE_THROW_EXCEPTION("Unsupported var type `%d`",
+                                        var_desc->Type());
         }
       }
     }
   }
-  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
-  output->Resize(input_tensor.dims());
-  output->mutable_data<T>();
 }

 template <typename Device, typename T>
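The resize rule in InitNoPersistableMemory keeps each activation's batch and channel extents and adopts the new input's height and width. A worked example of the make_ddim arithmetic with invented values (this is a heuristic; in the new Predict() path exact shapes come from InferShape):

#include <array>
#include <cstdint>
#include <iostream>

int main() {
  // Activation planned for the previous, larger input:
  std::array<int64_t, 4> tensor_dim{1, 32, 224, 224};  // N, C, H, W
  // The new, smaller model input:
  std::array<int64_t, 4> input_dim{1, 3, 112, 112};
  // make_ddim({tensor_dim[0], tensor_dim[1], input[2], input[3]}):
  std::array<int64_t, 4> new_dim{tensor_dim[0], tensor_dim[1],
                                 input_dim[2], input_dim[3]};
  for (int64_t d : new_dim) std::cout << d << ' ';  // prints: 1 32 112 112
  std::cout << '\n';
  return 0;
}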
@@ -406,9 +411,7 @@ void Executor<Device, T>::SetInput(const Tensor &input,
   target.ShareDataWith(input);
   if (feed_indices_.size() == 1) {
     auto &dim = input.dims();
-    if (lod_mode_ && product(dim) < 0.9 * product(input_dim_last_)) {
-      InitNoPersistableMemory(target);
-    }
+    shouldAdjustMemory_ = (product(dim) < 0.9 * product(input_dim_last_));
     input_dim_has_changed_ = input_dim_last_ != dim;
     input_dim_last_ = static_cast<DDim>(dim);
   }
@@ -430,9 +433,7 @@ void Executor<Device, T>::SetInput(const LoDTensor &input,
   target.set_lod(input.lod());
   if (feed_indices_.size() == 1) {
     auto &dim = input.dims();
-    if (lod_mode_ && product(dim) < 0.9 * product(input_dim_last_)) {
-      InitNoPersistableMemory(target);
-    }
+    shouldAdjustMemory_ = (product(dim) < 0.9 * product(input_dim_last_));
     input_dim_has_changed_ = input_dim_last_ != dim;
     input_dim_last_ = static_cast<DDim>(dim);
   }
@@ -482,7 +483,16 @@ PMStatus Executor<Device, T>::Predict() {
   // clear all no persistable tensor array since write_to_array
   // is always push back a new tensor in the array
   ClearNoPersistableTensorArray(program_desc_.get(), program_.scope.get());
+  if (lod_mode_ && input_dim_has_changed_) {
+    for (int i = 0; i < ops_of_block0_.size(); ++i) {
+      auto &op_handler = ops_of_block0_[i];
+      op_handler->InferShape();
+    }
+    if (memoryOpt_ != nullptr && shouldAdjustMemory_) {
+      shouldAdjustMemory_ = false;
+      memoryOpt_->AdjustMemory();
+    }
+  }
 #ifdef PADDLE_MOBILE_PROFILE
   std::vector<ProfInfo> profile(ops_of_block0_.size());
   struct timespec ts;
@@ -493,12 +503,12 @@ PMStatus Executor<Device, T>::Predict() {
 #ifdef PADDLE_MOBILE_PROFILE
     clock_gettime(CLOCK_MONOTONIC, &ts);
     profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
+    // if (lod_mode_ && input_dim_has_changed_) {
+    //   op_handler->InferShape();
+    // }
 #endif
     DLOG << i << "th, "
          << "run op: " << op_handler->Type();
-    if (lod_mode_ && input_dim_has_changed_) {
-      op_handler->InferShape();
-    }
     op_handler->Run();
 #ifdef PADDLE_MOBILE_PROFILE
     clock_gettime(CLOCK_MONOTONIC, &ts);
......
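The run-loop change is worth spelling out: shape inference used to run per op inside the timed loop whenever dims changed; now all shapes are refreshed once before execution, and the old call survives only as a comment in the profiling block. A condensed sketch of the restructured loop (OpSketch and the adjust_memory callback are invented stand-ins):

#include <functional>
#include <memory>
#include <vector>

struct OpSketch {
  void InferShape() { /* recompute output dims from input dims */ }
  void Run() { /* execute the kernel */ }
};

void PredictSketch(std::vector<std::unique_ptr<OpSketch>> &ops,
                   bool lod_mode, bool dims_changed, bool &should_adjust,
                   const std::function<void()> &adjust_memory) {
  if (lod_mode && dims_changed) {
    for (auto &op : ops) op->InferShape();  // all shapes fixed up front
    if (should_adjust) {
      should_adjust = false;   // shrink at most once per smaller input
      adjust_memory();         // -> MemoryOptPass::AdjustMemory()
    }
  }
  for (auto &op : ops) op->Run();  // hot loop: no InferShape work left
}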
@@ -27,6 +27,7 @@ limitations under the License. */
 #include "framework/program/program.h"
 #include "framework/tensor.h"
 #include "framework/type_trait.h"
+#include "pass/memory_optimize.h"

 namespace paddle_mobile {
 namespace framework {
@@ -104,6 +105,9 @@ class Executor {
   DDim input_dim_last_;
   bool input_dim_has_changed_ = true;
+  bool shouldAdjustMemory_ = false;
+  std::shared_ptr<pass::MemoryOptPass> memoryOpt_;
+
 #ifdef PADDLE_MOBILE_PROFILE
   typedef typename DtypeTensorTrait<Device>::gtype ProfileTensorType;
......
@@ -104,27 +104,14 @@ class Tensor : public TensorBase {
     return *this;
   }

-  template <typename T>
-  inline T *mutable_data_new() {
-    static_assert(std::is_pod<T>::value, "T must be POD");
-    const kTypeId_t type = type_id<T>().hash_code();
-
-    if (holder_ != nullptr) {
-      holder_->set_type(type);
-    }
-
-    PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.")
-    int64_t size = numel() * SizeOfType(type);
-    if (holder_ == nullptr || holder_->size() != size + offset_) {
-      if (holder_ == nullptr) {
-        holder_.reset(new PlaceholderImpl(size, type));
-      } else {
-        holder_->realloc(size);
-      }
-      offset_ = 0;
-    }
-    return reinterpret_cast<T *>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                                 offset_);
-  }
+  inline void mutable_data_new() {
+    if (holder_ != nullptr) {
+      PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.")
+      int64_t size = numel() * SizeOfType(holder_->type());
+      if (holder_->size() != size + offset_) {
+        holder_->realloc(size + offset_);
+      }
+    }
+  }

   inline void *mutable_data(const kTypeId_t type) {
......
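The new mutable_data_new() only resizes an existing allocation to the tensor's current numel; it never creates a holder or retypes it (that job stays with mutable_data). A minimal model of that contract, with HolderLike and the helper invented for the sketch:

#include <cstdint>
#include <cstdlib>

struct HolderLike {
  void *ptr = nullptr;
  int64_t bytes = 0;
  void realloc_to(int64_t n) {  // models holder_->realloc(...)
    ptr = std::realloc(ptr, static_cast<size_t>(n));
    bytes = n;
  }
};

// Models Tensor::mutable_data_new(): no-op without a holder, otherwise
// resize the existing buffer to numel * sizeof(element) + offset.
void mutable_data_new_like(HolderLike *holder, int64_t numel,
                           int64_t elem_size, int64_t offset) {
  if (holder == nullptr) return;         // nothing allocated yet: do nothing
  int64_t size = numel * elem_size;
  if (holder->bytes != size + offset) {  // only touch memory when it differs
    holder->realloc_to(size + offset);
  }
}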
@@ -57,6 +57,7 @@ void MemoryOptPass::operator()(
     AppendBlockVars(block.get());

     reused_nodes_.clear();
+    memoryDeputies_.clear();
     // collect all not persistable variables, and accumulate
     // it's reference count
     std::stack<VarNode *> empty_var_nodes;
@@ -156,15 +157,33 @@ void MemoryOptPass::operator()(
       auto *reuse_tensor =
           reused_var->template GetMutable<framework::LoDTensor>();
       reuse_tensor->mutable_data<float>();
+      framework::Variable *deputyVar = nullptr;
+      int64_t varSize = 0;
       for (const auto &node : list) {
         DLOG << node->name;
         auto *var = scope->Var(node->name);
         auto *tensor = var->template GetMutable<framework::LoDTensor>();
         tensor->ShareHolderWith(*reuse_tensor);
+        if (tensor->numel() > varSize) {
+          varSize = tensor->numel();
+          deputyVar = var;
+        }
       }
+      if (deputyVar) {
+        memoryDeputies_.push_back(deputyVar);
+      }
     }
   }
 }
+
+void MemoryOptPass::AdjustMemory() {
+  for (auto &deputy : memoryDeputies_) {
+    if (deputy->IsType<framework::LoDTensor>()) {
+      auto *tensor = deputy->template GetMutable<framework::LoDTensor>();
+      tensor->mutable_data_new();
+    }
+  }
+}

 }  // namespace pass
 }  // namespace paddle_mobile
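Each reuse group shares one backing buffer; the pass remembers the group's largest consumer (its "deputy") so that AdjustMemory() can later resize the shared holder through a single representative tensor. A small illustration of the selection rule, with plain structs standing in for Variable and LoDTensor:

#include <cstdint>
#include <vector>

struct VarLike {
  const char *name;
  int64_t numel;  // element count after the latest InferShape
};

// Pick the group's deputy: the variable with the largest numel wins,
// mirroring the loop added to MemoryOptPass::operator()().
VarLike *PickDeputy(std::vector<VarLike> &group) {
  VarLike *deputy = nullptr;
  int64_t var_size = 0;
  for (auto &v : group) {
    if (v.numel > var_size) {
      var_size = v.numel;
      deputy = &v;
    }
  }
  return deputy;  // may be nullptr if every numel is <= 0
}

Shrinking the deputy's tensor then shrinks the one holder that every other tensor in the group points at via ShareHolderWith. Note that the diff's `framework::Variable *deputyVar;` is shown here initialized to nullptr; without that, the `if (deputyVar)` test reads an uninitialized pointer when a group is empty.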
@@ -51,11 +51,14 @@ class MemoryOptPass : public PassBase {
   VarNode *CreateNode(const std::string name);

+  void AdjustMemory();
+
  private:
   std::stack<VarNode *> analysis_nodes_;
   std::vector<std::vector<VarNode *>> reused_nodes_;
   std::unordered_map<std::string, VarNode *> created_nodes_;
   std::unordered_map<std::string, framework::VarDesc *> block_vars_;
+  std::vector<framework::Variable *> memoryDeputies_;
 };

 }  // namespace pass
......