提交 5506defe 编写于 作者: X xiebaiyuan 提交者: GitHub

paddle mobile runtime cl memory optimise. test=develop (#2160)

上级 ebae97f4
......@@ -146,20 +146,26 @@ class CLImage {
initialized_ = true;
DLOG << " end init cl image";
}
// create fake size cl_mem for mem share
/**
* create fake size cl_mem for mem share
*/
void InitFakeSizeImage(cl_context context, cl_command_queue command_queue,
const DDim &need_dims, const DDim &real_dims) {
const DDim &need_dims, const DDim &real_image_dims) {
PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
" empty image tensor data shouldn't have value");
CLImageConverterNormal *normal_converter = new CLImageConverterNormal();
real_image_dims = normal_converter->InitImageDimInfoWith(real_dims);
real_tensor_dims = real_dims;
// use real image dims to create mem
real_image_dims_ = real_image_dims;
InitCLImage(context, real_image_dims_[0], real_image_dims_[1], nullptr);
// cheat cl_image they got what they wanted
image_dims_ = normal_converter->InitImageDimInfoWith(need_dims);
InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
DLOG << "InitFakeSizeImage ... ";
DLOG << "real_image_dims: " << real_image_dims_;
DLOG << "image_dims_: " << image_dims_;
PADDLE_MOBILE_ENFORCE(real_image_dims_[0] >= image_dims_[0] &&
real_image_dims_[1] >= image_dims_[1],
"real image is not enough");
tensor_dims_ = need_dims;
command_queue_ = command_queue;
image_converter_ = normal_converter;
......@@ -167,16 +173,28 @@ class CLImage {
initialized_ = true;
DLOG << " end init cl image";
}
void InitWithExitedMem(cl_context context, cl_command_queue command_queue,
DDim need_dims, const CLImage &src) {
/**
* init cl mem with an existing cl mem
*/
void InitWithExistMem(cl_context context, cl_command_queue command_queue,
DDim need_dims, CLImage &src) {
CLImageConverterNormal *normal_converter = new CLImageConverterNormal();
real_image_dims = normal_converter->InitImageDimInfoWith(src.dims());
real_tensor_dims = src.dims();
real_image_dims_ = src.real_image_dims_;
image_dims_ = normal_converter->InitImageDimInfoWith(need_dims);
// InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
DLOG << "InitWithExistMem ... ";
DLOG << "real_image_dims: " << real_image_dims_;
DLOG << "image_dims_: " << image_dims_;
// PADDLE_MOBILE_ENFORCE(real_image_dims[0] >= image_dims_[0] &&
// real_image_dims[1] >= image_dims_[1],
// "real image is not enough!");
if (real_image_dims_[0] < image_dims_[0] ||
real_image_dims_[1] < image_dims_[1]) {
DLOG << "real image is not enough!";
DLOG << "real_image_dims: " << real_image_dims_;
DLOG << "image_dims_: " << image_dims_;
}
if (cl_image_ != src.cl_image_) {
cl_image_.reset(src.cl_image_.get());
}
......@@ -289,9 +307,7 @@ class CLImage {
DDim tensor_dims_;
DDim image_dims_;
// real image dims; usually the same as image_dims
DDim real_image_dims;
// real tensor dims usually it is same as tensor dims
DDim real_tensor_dims;
DDim real_image_dims_;
float *tensor_data_ = nullptr;
cl_context context_;
cl_command_queue command_queue_;
......
......@@ -33,7 +33,7 @@ limitations under the License. */
#include "pass/model_obfuscate.h"
#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#include "pass/memory_optimize_super.h"
#include "pass/memory_optimize_cl.h"
#endif
namespace paddle_mobile {
......@@ -126,6 +126,14 @@ Executor<Device, T>::Executor(const Program<Device> &program,
printf("================[ op init profile ]==================\n");
PrintProfile(profile);
#endif
#ifdef PADDLE_MOBILE_CL
if (!config.load_when_predict && !lod_mode &&
config_.memory_optimization_level != NoMemoryOptimization) {
pass::MemoryOptPassCl()(program_desc_.get(), program_.scope.get(),
config_.memory_optimization_level);
}
#endif
}
template <typename Device, typename T>
......@@ -853,10 +861,13 @@ void Executor<GPU_CL, float>::SetInput(const Tensor &input,
DLOG << "SetInput ---- > resize1";
input_tensor->Resize(input.dims());
input_tensor->mutable_data<float>();
// InitNoPersistableMemory(*input_tensor);
pass::MemoryOptPassSuper()(program_desc_.get(), program_.scope.get(),
config_.memory_optimization_level,
input.dims());
if (config_.memory_optimization_level == NoMemoryOptimization) {
InitNoPersistableMemory(*input_tensor);
} else {
pass::MemoryOptPassCl()(program_desc_.get(), program_.scope.get(),
config_.memory_optimization_level,
input.dims());
}
}
} else {
DLOG << "SetInput ---- > resize2";
......
......@@ -12,21 +12,21 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_CL
#include "pass/memory_optimize_super.h"
#include "pass/memory_optimize_cl.h"
#include <algorithm>
#include "framework/cl/cl_image.h"
#include "framework/lod_tensor.h"
namespace paddle_mobile {
namespace pass {
void MemoryOptPassSuper::AppendBlockVars(const framework::BlockDesc *block) {
/**
 * Register every variable descriptor of `block` in block_vars_, keyed by the
 * variable's name.
 *
 * Entries added by earlier calls are kept (the clear() below is disabled); a
 * name that is already present is overwritten with the new descriptor.
 *
 * @param block block whose variables are appended; it is dereferenced
 *              unconditionally, so it must not be null.
 */
void MemoryOptPassCl::AppendBlockVars(const framework::BlockDesc *block) {
  // block_vars_.clear();
  // Bind each element by const reference: only Name() and get() are needed,
  // so copying the element (presumably a smart pointer - TODO confirm) on
  // every iteration is unnecessary work.
  for (const auto &var : block->Vars()) {
    // Only the raw pointer is stored.
    // NOTE(review): assumes the descriptors outlive this pass - confirm.
    block_vars_[var->Name()] = var.get();
  }
}
bool MemoryOptPassSuper::IsPersistable(const std::string name) {
bool MemoryOptPassCl::IsPersistable(const std::string name) {
const auto it = block_vars_.find(name);
if (it != block_vars_.end()) {
return it->second->Persistable();
......@@ -34,7 +34,7 @@ bool MemoryOptPassSuper::IsPersistable(const std::string name) {
return false;
}
ClVarNode *MemoryOptPassSuper::CreateNode(const std::string name) {
ClVarNode *MemoryOptPassCl::CreateNode(const std::string name) {
auto it = created_nodes_.find(name);
if (it != created_nodes_.end()) {
++(it->second->count);
......@@ -48,7 +48,7 @@ ClVarNode *MemoryOptPassSuper::CreateNode(const std::string name) {
return var;
}
void MemoryOptPassSuper::operator()(
void MemoryOptPassCl::operator()(
const framework::ProgramDesc *program, framework::Scope *scope,
MemoryOptimizationLevel memory_optimization_level,
framework::DDim target_dims) {
......@@ -82,6 +82,8 @@ void MemoryOptPassSuper::operator()(
DLOG << "op_desc->Type(): " << op->Type();
for (const auto &outputs : op->GetOutputs()) {
for (const auto &output : outputs.second) {
// neither persistable nor excluded, so add it to
// analysis_nodes
if (!IsPersistable(output) &&
std::find(exclude_var_names.begin(), exclude_var_names.end(),
output) == exclude_var_names.end()) {
......@@ -93,6 +95,8 @@ void MemoryOptPassSuper::operator()(
}
for (const auto &inputs : op->GetInputs()) {
for (const auto &input : inputs.second) {
// neither persistable nor excluded, so add it to
// analysis_nodes
if (!IsPersistable(input) &&
std::find(exclude_var_names.begin(), exclude_var_names.end(),
input) == exclude_var_names.end()) {
......@@ -128,6 +132,7 @@ void MemoryOptPassSuper::operator()(
bool reused = false;
// find out a possible reuse list
for (auto &list : reused_nodes_) {
// reference count = 0 and not in fetch list
if (list.back()->count == 0 &&
std::find(fetch_var_nodes.begin(), fetch_var_nodes.end(),
list.back()) == fetch_var_nodes.end()) {
......@@ -146,60 +151,115 @@ void MemoryOptPassSuper::operator()(
node->visited = true;
node->count -= 1;
}
// shared data within all variables in the same reused list
ShareData(scope, memory_optimization_level, target_dims);
}
}
void MemoryOptPassSuper::ShareData(
// Makes the variables of each reuse list share OpenCL image memory.
//
// For every list in reused_nodes_:
//   1. Each node's CLImage is fetched from `scope` and its tensor dims are
//      converted to image dims with normal_converter->InitImageDimInfoWith().
//   2. Tensors whose image dim [0] is greater than image dim [1] form the
//      "x based" group, all others the "y based" group. For each group the
//      tensor with the largest numel() and the maximum image dims are tracked.
//   3. The selected tensor of each non-empty group is initialised through
//      InitFakeSizeImage() with the group's maximum image dims.
//   4. Every tensor then calls InitWithExistMem() with its group's tensor.
//
// When target_dims has 4 entries, entries [2] and [3] of each tensor's dims
// are replaced by target_dims[2] and target_dims[3].
// memory_optimization_level is not read by the statements shown here.
//
// NOTE(review): this listing looks like a rendered diff, so statements of the
// previous implementation (those using max_numl, reuse_tensor, reshaped_dim
// and InitWithExitedMem) appear interleaved with the new ones - confirm
// against the actual source file before relying on the exact statement order.
void MemoryOptPassCl::ShareData(
framework::Scope *scope, MemoryOptimizationLevel memory_optimization_level,
framework::DDim target_dims)
const { // shared data within all variables in the same reused list
cl_context context = scope->GetCLScpoe()->Context();
cl_command_queue command_queue = scope->GetCLScpoe()->CommandQueue();
for (const auto &list : reused_nodes_) {
DLOG << "\n";
DLOG << "gpu . share memory within these variables";
// find max dims
int64_t max_numl = -1;
int64_t x_based_max_numl = -1;
int64_t y_based_max_numl = -1;
int64_t x_based_max_x = -1;
int64_t x_based_max_y = -1;
int64_t y_based_max_x = -1;
int64_t y_based_max_y = -1;
framework::CLImage *reuse_tensor = nullptr;
DLOG << "resused nodes group ----------";
framework::CLImage *x_based_reuse_tensor = nullptr;
framework::CLImage *y_based_reuse_tensor = nullptr;
// first pass: classify every tensor of the list and collect per-group maxima
for (const auto &node : list) {
auto *var = scope->Var(node->name);
auto *tensor = var->template GetMutable<framework::CLImage>();
// numel() is read before the optional Resize() further down
const int64_t numl = tensor->numel();
if (max_numl < numl) {
max_numl = numl;
reuse_tensor = tensor;
auto origin_tensor_dims = tensor->dims();
// NOTE(review): the condition requires exactly 4 dims, while the message
// text says "larger than 4" - confirm which one is intended.
PADDLE_MOBILE_ENFORCE(origin_tensor_dims.size() == 4,
"tensor dims must larger than 4");
// for super ,hack origin dims
if (target_dims.size() == 4) {
origin_tensor_dims = {origin_tensor_dims[0], origin_tensor_dims[1],
target_dims[2], target_dims[3]};
tensor->Resize(origin_tensor_dims);
}
DLOG << node->name << " ----dims: " << tensor->dims()
<< "----numl----: " << numl;
}
if (reuse_tensor == nullptr) {
return;
const framework::DDim &image_dims =
normal_converter->InitImageDimInfoWith(origin_tensor_dims);
int64_t image_dims_x = image_dims[0];
int64_t image_dims_y = image_dims[1];
// classify memory into two parts
if (image_dims_x > image_dims_y) {
// choose a biggest tensor for reuse
if (x_based_max_numl < numl) {
x_based_max_numl = numl;
x_based_reuse_tensor = tensor;
}
x_based_max_x = std::max(x_based_max_x, image_dims_x);
x_based_max_y = std::max(x_based_max_y, image_dims_y);
} else {
// choose a biggest tensor for reuse
if (y_based_max_numl < numl) {
y_based_max_numl = numl;
y_based_reuse_tensor = tensor;
}
y_based_max_x = std::max(y_based_max_x, image_dims_x);
y_based_max_y = std::max(y_based_max_y, image_dims_y);
}
}
const framework::DDim &dims = reuse_tensor->dims();
cl_context context = scope->GetCLScpoe()->Context();
cl_command_queue command_queue = scope->GetCLScpoe()->CommandQueue();
framework::DDim reshaped_dim = framework::make_ddim(
{dims[0], dims[1], target_dims[2], target_dims[3]});
PADDLE_MOBILE_ENFORCE(
x_based_reuse_tensor != nullptr || y_based_reuse_tensor != nullptr,
"x_based_reuse_tensor and y_based_reuse_tensor can not be null at same "
"time");
DLOG << "target dims : " << target_dims;
DLOG << "reshaped_dim : " << reshaped_dim;
reuse_tensor->InitFakeSizeImage(context, command_queue, reshaped_dim,
reshaped_dim);
// init x based shared cl mem
if (x_based_reuse_tensor != nullptr) {
const framework::DDim &x_reuse_dims = x_based_reuse_tensor->dims();
// need dims: the tensor's own dims; real image dims: the group maxima
x_based_reuse_tensor->InitFakeSizeImage(
context, command_queue, x_reuse_dims, {x_based_max_x, x_based_max_y});
}
// init y based shared cl mem
if (y_based_reuse_tensor != nullptr) {
const framework::DDim &y_reuse_dims = y_based_reuse_tensor->dims();
// need dims: the tensor's own dims; real image dims: the group maxima
y_based_reuse_tensor->InitFakeSizeImage(
context, command_queue, y_reuse_dims, {y_based_max_x, y_based_max_y});
}
// share mem
// second pass: attach every tensor of the list to its group's shared image
for (const auto &node : list) {
auto *var = scope->Var(node->name);
auto *tensor = var->template GetMutable<framework::CLImage>();
const framework::DDim &temp_dim = tensor->dims();
framework::DDim need_dims = framework::make_ddim(
{temp_dim[0], temp_dim[1], target_dims[2], target_dims[3]});
tensor->InitWithExitedMem(context, command_queue, need_dims,
*reuse_tensor);
auto need_dims = tensor->dims();
// for super ,hack origin dims
if (target_dims.size() == 4) {
need_dims = {need_dims[0], need_dims[1], target_dims[2],
target_dims[3]};
}
const framework::DDim &need_image_dims =
normal_converter->InitImageDimInfoWith(need_dims);
int64_t image_dims_x = need_image_dims[0];
int64_t image_dims_y = need_image_dims[1];
// same x/y classification rule as in the first pass
if (image_dims_x > image_dims_y) {
PADDLE_MOBILE_ENFORCE(x_based_reuse_tensor != nullptr,
"x_based_reuse_tensor not null here");
tensor->InitWithExistMem(context, command_queue, need_dims,
*x_based_reuse_tensor);
} else {
PADDLE_MOBILE_ENFORCE(y_based_reuse_tensor != nullptr,
"y_based_reuse_tensor not null here");
tensor->InitWithExistMem(context, command_queue, need_dims,
*y_based_reuse_tensor);
}
}
}
}
......
......@@ -19,10 +19,12 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include <vector>
#include "framework/cl/cl_image_converter.h"
#include "framework/lod_tensor.h"
#include "framework/program/program.h"
#include "pass/pass_base.h"
// use for super resulotion to be extend for all opencl
// use for opencl
namespace paddle_mobile {
namespace pass {
......@@ -34,19 +36,20 @@ typedef struct {
// MemoryOptPass will analyze the program, and reuse memory between
// variables as much as possible
class MemoryOptPassSuper : public PassBase {
class MemoryOptPassCl : public PassBase {
public:
MemoryOptPassSuper() {}
virtual ~MemoryOptPassSuper() {
MemoryOptPassCl() {}
virtual ~MemoryOptPassCl() {
for (auto &it : created_nodes_) {
delete it.second;
}
delete normal_converter;
}
void operator()(const framework::ProgramDesc *program,
framework::Scope *scope,
MemoryOptimizationLevel memory_optimization_level,
framework::DDim dims);
framework::DDim dims = {});
void AppendBlockVars(const framework::BlockDesc *block);
......@@ -63,6 +66,8 @@ class MemoryOptPassSuper : public PassBase {
std::vector<std::vector<ClVarNode *>> reused_nodes_;
std::unordered_map<std::string, ClVarNode *> created_nodes_;
std::unordered_map<std::string, framework::VarDesc *> block_vars_;
paddle_mobile::framework::CLImageConverterNormal *normal_converter =
new paddle_mobile::framework::CLImageConverterNormal();
};
} // namespace pass
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册