Unverified commit 528fd741, authored by xiebaiyuan, committed by GitHub

paddle mobile runtime cl memory optimise. test=develop (#2160)

Parent 5f227934
@@ -146,20 +146,26 @@ class CLImage {
initialized_ = true;
DLOG << " end init cl image";
}
// create a fake-size cl_mem for memory sharing
/**
 * create a fake-size cl_mem for memory sharing
 */
void InitFakeSizeImage(cl_context context, cl_command_queue command_queue,
const DDim &need_dims, const DDim &real_dims) {
const DDim &need_dims, const DDim &real_image_dims) {
PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
" empty image tensor data shouldn't have value");
CLImageConverterNormal *normal_converter = new CLImageConverterNormal();
real_image_dims = normal_converter->InitImageDimInfoWith(real_dims);
real_tensor_dims = real_dims;
// use real image dims to create mem
real_image_dims_ = real_image_dims;
InitCLImage(context, real_image_dims_[0], real_image_dims_[1], nullptr);
// cheat cl_image so callers see the size they asked for
image_dims_ = normal_converter->InitImageDimInfoWith(need_dims);
InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
DLOG << "InitFakeSizeImage ... ";
DLOG << "real_image_dims: " << real_image_dims_;
DLOG << "image_dims_: " << image_dims_;
PADDLE_MOBILE_ENFORCE(real_image_dims_[0] >= image_dims_[0] &&
real_image_dims_[1] >= image_dims_[1],
"real image is not enough");
tensor_dims_ = need_dims;
command_queue_ = command_queue;
image_converter_ = normal_converter;
@@ -167,16 +173,28 @@ class CLImage {
initialized_ = true;
DLOG << " end init cl image";
}
void InitWithExitedMem(cl_context context, cl_command_queue command_queue,
DDim need_dims, const CLImage &src) {
/**
* init cl mem from an existing cl mem
*/
void InitWithExistMem(cl_context context, cl_command_queue command_queue,
DDim need_dims, CLImage &src) {
CLImageConverterNormal *normal_converter = new CLImageConverterNormal();
real_image_dims = normal_converter->InitImageDimInfoWith(src.dims());
real_tensor_dims = src.dims();
real_image_dims_ = src.real_image_dims_;
image_dims_ = normal_converter->InitImageDimInfoWith(need_dims);
// InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
DLOG << "InitWithExistMem ... ";
DLOG << "real_image_dims: " << real_image_dims_;
DLOG << "image_dims_: " << image_dims_;
// PADDLE_MOBILE_ENFORCE(real_image_dims[0] >= image_dims_[0] &&
// real_image_dims[1] >= image_dims_[1],
// "real image is not enough!");
if (real_image_dims_[0] < image_dims_[0] ||
real_image_dims_[1] < image_dims_[1]) {
DLOG << "real image is not enough!";
DLOG << "real_image_dims: " << real_image_dims_;
DLOG << "image_dims_: " << image_dims_;
}
if (cl_image_ != src.cl_image_) {
cl_image_.reset(src.cl_image_.get());
}
@@ -289,9 +307,7 @@ class CLImage {
DDim tensor_dims_;
DDim image_dims_;
// real image dims; usually the same as image_dims_
DDim real_image_dims;
// real tensor dims; usually the same as tensor_dims_
DDim real_tensor_dims;
DDim real_image_dims_;
float *tensor_data_ = nullptr;
cl_context context_;
cl_command_queue command_queue_;
......
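Taken together, the two new entry points implement the sharing scheme: `InitFakeSizeImage` creates the one real `cl_mem` at the largest image size a reuse group needs, and `InitWithExistMem` aliases smaller logical images onto it. A minimal sketch of the intended composition, assuming the `CLImage` API above (the helper name, handles, and dims are illustrative, not taken from this commit):

```cpp
#include "framework/cl/cl_image.h"

using paddle_mobile::framework::CLImage;

// Hypothetical helper: one backing allocation shared by two smaller images.
void ShareOneBackingImage(cl_context ctx, cl_command_queue queue) {
  CLImage backing, a, b;

  // Allocate the real cl_mem once, with an image size {width, height}
  // large enough for every tensor in the reuse group.
  backing.InitFakeSizeImage(ctx, queue, /*need_dims=*/{1, 32, 224, 224},
                            /*real_image_dims=*/{2048, 1792});

  // Later tensors only borrow the existing cl_mem; no new allocation is
  // made, provided their converted image dims fit inside real_image_dims_.
  a.InitWithExistMem(ctx, queue, /*need_dims=*/{1, 32, 112, 112}, backing);
  b.InitWithExistMem(ctx, queue, /*need_dims=*/{1, 16, 224, 224}, backing);
}
```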
@@ -33,7 +33,7 @@ limitations under the License. */
#include "pass/model_obfuscate.h"
#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#include "pass/memory_optimize_super.h"
#include "pass/memory_optimize_cl.h"
#endif
namespace paddle_mobile {
@@ -126,6 +126,14 @@ Executor<Device, T>::Executor(const Program<Device> &program,
printf("================[ op init profile ]==================\n");
PrintProfile(profile);
#endif
#ifdef PADDLE_MOBILE_CL
if (!config.load_when_predict && !lod_mode &&
config_.memory_optimization_level != NoMemoryOptimization) {
pass::MemoryOptPassCl()(program_desc_.get(), program_.scope.get(),
config_.memory_optimization_level);
}
#endif
}
template <typename Device, typename T>
@@ -853,10 +861,13 @@ void Executor<GPU_CL, float>::SetInput(const Tensor &input,
DLOG << "SetInput ---- > resize1";
input_tensor->Resize(input.dims());
input_tensor->mutable_data<float>();
// InitNoPersistableMemory(*input_tensor);
pass::MemoryOptPassSuper()(program_desc_.get(), program_.scope.get(),
config_.memory_optimization_level,
input.dims());
if (config_.memory_optimization_level == NoMemoryOptimization) {
InitNoPersistableMemory(*input_tensor);
} else {
pass::MemoryOptPassCl()(program_desc_.get(), program_.scope.get(),
config_.memory_optimization_level,
input.dims());
}
}
} else {
DLOG << "SetInput ---- > resize2";
......
@@ -12,21 +12,21 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_CL
#include "pass/memory_optimize_super.h"
#include "pass/memory_optimize_cl.h"
#include <algorithm>
#include "framework/cl/cl_image.h"
#include "framework/lod_tensor.h"
namespace paddle_mobile {
namespace pass {
void MemoryOptPassSuper::AppendBlockVars(const framework::BlockDesc *block) {
void MemoryOptPassCl::AppendBlockVars(const framework::BlockDesc *block) {
// block_vars_.clear();
for (const auto var : block->Vars()) {
block_vars_[var->Name()] = var.get();
}
}
bool MemoryOptPassSuper::IsPersistable(const std::string name) {
bool MemoryOptPassCl::IsPersistable(const std::string name) {
const auto it = block_vars_.find(name);
if (it != block_vars_.end()) {
return it->second->Persistable();
@@ -34,7 +34,7 @@ bool MemoryOptPassSuper::IsPersistable(const std::string name) {
return false;
}
ClVarNode *MemoryOptPassSuper::CreateNode(const std::string name) {
ClVarNode *MemoryOptPassCl::CreateNode(const std::string name) {
auto it = created_nodes_.find(name);
if (it != created_nodes_.end()) {
++(it->second->count);
@@ -48,7 +48,7 @@ ClVarNode *MemoryOptPassSuper::CreateNode(const std::string name) {
return var;
}
void MemoryOptPassSuper::operator()(
void MemoryOptPassCl::operator()(
const framework::ProgramDesc *program, framework::Scope *scope,
MemoryOptimizationLevel memory_optimization_level,
framework::DDim target_dims) {
@@ -82,6 +82,8 @@ void MemoryOptPassSuper::operator()(
DLOG << "op_desc->Type(): " << op->Type();
for (const auto &outputs : op->GetOutputs()) {
for (const auto &output : outputs.second) {
// not persistable and not an excluded one, then add it to
// analysis_nodes
if (!IsPersistable(output) &&
std::find(exclude_var_names.begin(), exclude_var_names.end(),
output) == exclude_var_names.end()) {
@@ -93,6 +95,8 @@ void MemoryOptPassSuper::operator()(
}
for (const auto &inputs : op->GetInputs()) {
for (const auto &input : inputs.second) {
// not persistable and not an excluded one, then add it to
// analysis_nodes
if (!IsPersistable(input) &&
std::find(exclude_var_names.begin(), exclude_var_names.end(),
input) == exclude_var_names.end()) {
@@ -128,6 +132,7 @@ void MemoryOptPassSuper::operator()(
bool reused = false;
// find a possible reuse list
for (auto &list : reused_nodes_) {
// reference count = 0 and not in fetch list
if (list.back()->count == 0 &&
std::find(fetch_var_nodes.begin(), fetch_var_nodes.end(),
list.back()) == fetch_var_nodes.end()) {
@@ -146,60 +151,115 @@ void MemoryOptPassSuper::operator()(
node->visited = true;
node->count -= 1;
}
// share data among all variables in the same reuse list
ShareData(scope, memory_optimization_level, target_dims);
}
}
void MemoryOptPassSuper::ShareData(
void MemoryOptPassCl::ShareData(
framework::Scope *scope, MemoryOptimizationLevel memory_optimization_level,
framework::DDim target_dims)
const { // share data among all variables in the same reuse list
cl_context context = scope->GetCLScpoe()->Context();
cl_command_queue command_queue = scope->GetCLScpoe()->CommandQueue();
for (const auto &list : reused_nodes_) {
DLOG << "\n";
DLOG << "gpu . share memory within these variables";
// find max dims
int64_t max_numl = -1;
int64_t x_based_max_numl = -1;
int64_t y_based_max_numl = -1;
int64_t x_based_max_x = -1;
int64_t x_based_max_y = -1;
int64_t y_based_max_x = -1;
int64_t y_based_max_y = -1;
framework::CLImage *reuse_tensor = nullptr;
DLOG << "resused nodes group ----------";
framework::CLImage *x_based_reuse_tensor = nullptr;
framework::CLImage *y_based_reuse_tensor = nullptr;
for (const auto &node : list) {
auto *var = scope->Var(node->name);
auto *tensor = var->template GetMutable<framework::CLImage>();
const int64_t numl = tensor->numel();
if (max_numl < numl) {
max_numl = numl;
reuse_tensor = tensor;
auto origin_tensor_dims = tensor->dims();
PADDLE_MOBILE_ENFORCE(origin_tensor_dims.size() == 4,
"tensor dims must larger than 4");
// for super ,hack origin dims
if (target_dims.size() == 4) {
origin_tensor_dims = {origin_tensor_dims[0], origin_tensor_dims[1],
target_dims[2], target_dims[3]};
tensor->Resize(origin_tensor_dims);
}
DLOG << node->name << " ----dims: " << tensor->dims()
<< "----numl----: " << numl;
}
if (reuse_tensor == nullptr) {
return;
const framework::DDim &image_dims =
normal_converter->InitImageDimInfoWith(origin_tensor_dims);
int64_t image_dims_x = image_dims[0];
int64_t image_dims_y = image_dims[1];
// classify memory into two parts
if (image_dims_x > image_dims_y) {
// choose the biggest tensor for reuse
if (x_based_max_numl < numl) {
x_based_max_numl = numl;
x_based_reuse_tensor = tensor;
}
x_based_max_x = std::max(x_based_max_x, image_dims_x);
x_based_max_y = std::max(x_based_max_y, image_dims_y);
} else {
// choose the biggest tensor for reuse
if (y_based_max_numl < numl) {
y_based_max_numl = numl;
y_based_reuse_tensor = tensor;
}
y_based_max_x = std::max(y_based_max_x, image_dims_x);
y_based_max_y = std::max(y_based_max_y, image_dims_y);
}
}
const framework::DDim &dims = reuse_tensor->dims();
cl_context context = scope->GetCLScpoe()->Context();
cl_command_queue command_queue = scope->GetCLScpoe()->CommandQueue();
framework::DDim reshaped_dim = framework::make_ddim(
{dims[0], dims[1], target_dims[2], target_dims[3]});
PADDLE_MOBILE_ENFORCE(
x_based_reuse_tensor != nullptr || y_based_reuse_tensor != nullptr,
"x_based_reuse_tensor and y_based_reuse_tensor can not be null at same "
"time");
DLOG << "target dims : " << target_dims;
DLOG << "reshaped_dim : " << reshaped_dim;
reuse_tensor->InitFakeSizeImage(context, command_queue, reshaped_dim,
reshaped_dim);
// init x based shared cl mem
if (x_based_reuse_tensor != nullptr) {
const framework::DDim &x_reuse_dims = x_based_reuse_tensor->dims();
x_based_reuse_tensor->InitFakeSizeImage(
context, command_queue, x_reuse_dims, {x_based_max_x, x_based_max_y});
}
// init y based shared cl mem
if (y_based_reuse_tensor != nullptr) {
const framework::DDim &y_reuse_dims = y_based_reuse_tensor->dims();
y_based_reuse_tensor->InitFakeSizeImage(
context, command_queue, y_reuse_dims, {y_based_max_x, y_based_max_y});
}
// share mem
for (const auto &node : list) {
auto *var = scope->Var(node->name);
auto *tensor = var->template GetMutable<framework::CLImage>();
const framework::DDim &temp_dim = tensor->dims();
framework::DDim need_dims = framework::make_ddim(
{temp_dim[0], temp_dim[1], target_dims[2], target_dims[3]});
tensor->InitWithExitedMem(context, command_queue, need_dims,
*reuse_tensor);
auto need_dims = tensor->dims();
// for super resolution, hack the origin dims
if (target_dims.size() == 4) {
need_dims = {need_dims[0], need_dims[1], target_dims[2],
target_dims[3]};
}
const framework::DDim &need_image_dims =
normal_converter->InitImageDimInfoWith(need_dims);
int64_t image_dims_x = need_image_dims[0];
int64_t image_dims_y = need_image_dims[1];
if (image_dims_x > image_dims_y) {
PADDLE_MOBILE_ENFORCE(x_based_reuse_tensor != nullptr,
"x_based_reuse_tensor not null here");
tensor->InitWithExistMem(context, command_queue, need_dims,
*x_based_reuse_tensor);
} else {
PADDLE_MOBILE_ENFORCE(y_based_reuse_tensor != nullptr,
"y_based_reuse_tensor not null here");
tensor->InitWithExistMem(context, command_queue, need_dims,
*y_based_reuse_tensor);
}
}
}
}
......
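The bucketing logic in `ShareData` is the core of the pass: each tensor's converted 2-D image shape goes into an x-dominant (`x > y`) or y-dominant bucket, and each bucket gets one max-extent backing image. A self-contained sketch of just that classification, with hypothetical `ImageDim`/`Buckets` types standing in for the `DDim`s used above:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for the {width, height} pair that
// CLImageConverterNormal::InitImageDimInfoWith() returns above.
struct ImageDim {
  int64_t x;
  int64_t y;
};

// Max extents per bucket: the size of each bucket's shared backing image.
struct Buckets {
  ImageDim x_based{-1, -1};  // for images with x > y (wide)
  ImageDim y_based{-1, -1};  // for images with x <= y (tall or square)
};

Buckets Classify(const std::vector<ImageDim> &dims) {
  Buckets b;
  for (const auto &d : dims) {
    if (d.x > d.y) {
      b.x_based.x = std::max(b.x_based.x, d.x);
      b.x_based.y = std::max(b.x_based.y, d.y);
    } else {
      b.y_based.x = std::max(b.y_based.x, d.x);
      b.y_based.y = std::max(b.y_based.y, d.y);
    }
  }
  return b;
}
```

Splitting into two buckets is the design choice that keeps the backing images tight: a single {max_x, max_y} image covering both wide and tall tensors would be far larger than any individual tensor actually needs.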
@@ -19,10 +19,12 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include <vector>
#include "framework/cl/cl_image_converter.h"
#include "framework/lod_tensor.h"
#include "framework/program/program.h"
#include "pass/pass_base.h"
// used for super resolution; to be extended to all opencl
// use for opencl
namespace paddle_mobile {
namespace pass {
@@ -34,19 +36,20 @@ typedef struct {
// MemoryOptPass will analyze the program, and reuse memory between
// variables as much as possible
class MemoryOptPassSuper : public PassBase {
class MemoryOptPassCl : public PassBase {
public:
MemoryOptPassSuper() {}
virtual ~MemoryOptPassSuper() {
MemoryOptPassCl() {}
virtual ~MemoryOptPassCl() {
for (auto &it : created_nodes_) {
delete it.second;
}
delete normal_converter;
}
void operator()(const framework::ProgramDesc *program,
framework::Scope *scope,
MemoryOptimizationLevel memory_optimization_level,
framework::DDim dims);
framework::DDim dims = {});
void AppendBlockVars(const framework::BlockDesc *block);
@@ -63,6 +66,8 @@ class MemoryOptPassSuper : public PassBase {
std::vector<std::vector<ClVarNode *>> reused_nodes_;
std::unordered_map<std::string, ClVarNode *> created_nodes_;
std::unordered_map<std::string, framework::VarDesc *> block_vars_;
paddle_mobile::framework::CLImageConverterNormal *normal_converter =
new paddle_mobile::framework::CLImageConverterNormal();
};
} // namespace pass
......
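For reference, the two call sites of the renamed pass in this commit, condensed (executor members as in the diff above):

```cpp
// At load time (Executor constructor): run the pass once, no target dims.
pass::MemoryOptPassCl()(program_desc_.get(), program_.scope.get(),
                        config_.memory_optimization_level);

// At SetInput time, when the input shape changes: re-run with the new dims
// so the shared images are recreated at the sizes this input actually needs.
pass::MemoryOptPassCl()(program_desc_.get(), program_.scope.get(),
                        config_.memory_optimization_level, input.dims());
```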