Commit 41a8af2b authored by xiebaiyuan, committed by Jiaying Zhao

remove mali . && optimise memory use of super resolution (#1794)

close #1791
Parent 225ec4e0
......@@ -53,7 +53,6 @@ struct DeviceType {};
typedef DeviceType<kCPU> CPU;
typedef DeviceType<kFPGA> FPGA;
typedef DeviceType<kGPU_MALI> GPU_MALI;
typedef DeviceType<kGPU_CL> GPU_CL;
//! data type
......
......@@ -145,21 +145,61 @@ class CLImage {
initialized_ = true;
DLOG << " end init cl image";
}
void InitEmptyWithImageDim(cl_context context, cl_command_queue command_queue,
const DDim &image_dims) {
DLOG << " to get image dims ";
image_dims_ = image_dims;
DLOG << " end get image dims " << image_dims_;
......
}

// create a fake-size cl_mem so several images can share one real allocation
void InitFakeSizeImage(cl_context context, cl_command_queue command_queue,
const DDim &need_dims, const DDim &real_dims) {
PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
" empty image tensor data shouldn't have value");
CLImageConverterNormal *normal_converter = new CLImageConverterNormal();
real_image_dims = normal_converter->InitImageDimInfoWith(real_dims);
real_tensor_dims = real_dims;
image_dims_ = normal_converter->InitImageDimInfoWith(need_dims);
InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
tensor_dims_ = need_dims;
command_queue_ = command_queue;
image_converter_ = normal_converter;
cl_event_ = CLEngine::Instance()->CreateEvent(context);
initialized_ = true;
DLOG << " end init cl image";
}
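// share src's existing cl_mem instead of allocating a new one; need_dims
// may differ from src's logical dims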
void InitWithExitedMem(cl_context context, cl_command_queue command_queue,
DDim need_dims, CLImage &src) {
CLImageConverterNormal *normal_converter = new CLImageConverterNormal();
real_image_dims = normal_converter->InitImageDimInfoWith(src.dims());
real_tensor_dims = src.dims();
image_dims_ = normal_converter->InitImageDimInfoWith(need_dims);
// InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
if (cl_image_ != src.cl_image_) {
cl_image_.reset(src.cl_image_.get());
}
tensor_dims_ = need_dims;
command_queue_ = command_queue;
image_converter_ = normal_converter;
cl_event_ = CLEngine::Instance()->CreateEvent(context);
initialized_ = true;
DLOG << " end init cl image";
}
/*! The two tensors share the same underlying memory block. */
inline CLImage &ShareHolderWith(const CLImage &src) {
PADDLE_MOBILE_ENFORCE(
src.cl_image_ != nullptr,
"Tensor holds no memory. Call Tensor::mutable_data first.")
if (cl_image_ != src.cl_image_) {
cl_image_.reset(src.cl_image_.get());
}
return *this;
}
cl_mem GetCLImage() const { return cl_image_.get(); }
const DDim &ImageDims() const { return image_dims_; }
......@@ -238,6 +278,10 @@ class CLImage {
std::unique_ptr<_cl_event, CLEventDeleter> cl_event_;
DDim tensor_dims_;
DDim image_dims_;
// real image dims; usually the same as image_dims_
DDim real_image_dims;
// real tensor dims; usually the same as tensor_dims_
DDim real_tensor_dims;
float *tensor_data_ = nullptr;
cl_context context_;
cl_command_queue command_queue_;
......
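Taken together, the two new entry points compose as follows — a minimal sketch, not part of the commit, assuming a valid OpenCL context/queue pair (the names `ctx`, `queue`, `holder`, `alias` and all shapes are hypothetical): one image in a reuse group owns the real allocation, sized for its largest consumer, and every other image borrows that allocation under its own logical dims. This is exactly the pattern `MemoryOptPassSuper::ShareData` applies further down.

  // Sketch only; ctx, queue, and all dims are illustrative NCHW values.
  framework::CLImage holder;
  framework::DDim max_dims = framework::make_ddim({1, 32, 1080, 1920});
  // The holder owns the real cl_mem, sized for the biggest tensor in the group.
  holder.InitFakeSizeImage(ctx, queue, max_dims, max_dims);

  framework::CLImage alias;
  framework::DDim need_dims = framework::make_ddim({1, 32, 540, 960});
  // Aliases reuse holder's cl_mem but keep their own logical tensor dims.
  alias.InitWithExitedMem(ctx, queue, need_dims, holder);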
......@@ -33,6 +33,7 @@ limitations under the License. */
#include "pass/model_obfuscate.h"
#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#include "pass/memory_optimize_super.h"
#endif
namespace paddle_mobile {
......@@ -55,7 +56,7 @@ Executor<Device, T>::Executor(const Program<Device> &program,
use_optimize_(use_optimize),
lod_mode_(lod_mode),
config_(config) {
DLOG << "executor in lod mode: " << lod_mode_;
DLOG << "executor in lod mode: " << lod_mode;
Variable *variable_ptr = program_.scope->Var("batch_size");
variable_ptr->SetValue<int>(batch_size);
......@@ -805,27 +806,30 @@ void Executor<GPU_CL, float>::SetInput(const Tensor &input,
index = feed_indices_.find(var_name)->second;
}
auto *feed_var = program_.scope->Var("feed");
framework::LoDTensor *target_tensor =
framework::LoDTensor *input_tensor =
&(feed_var->template GetMutable<framework::LoDTensorArray>()->at(index));
DLOG << "config_.load_when_predict " << config_.load_when_predict;
DLOG << "target_tensor->IsInitialized() " << target_tensor->IsInitialized();
DLOG << "target_tensor->dims() " << target_tensor->dims();
DLOG << "target_tensor->IsInitialized() " << input_tensor->IsInitialized();
DLOG << "target_tensor->dims() " << input_tensor->dims();
DLOG << "input.dims() " << input.dims();
DLOG << "input_dim_last_ " << input_dim_last_;
if (config_.load_when_predict) {
if (input_dim_last_ != input.dims()) {
DLOG << "SetInput ---- > resize1";
target_tensor->Resize(input.dims());
target_tensor->mutable_data<float>();
InitNoPersistableMemory(*target_tensor);
input_tensor->Resize(input.dims());
input_tensor->mutable_data<float>();
// InitNoPersistableMemory(*input_tensor);
pass::MemoryOptPassSuper()(program_desc_.get(), program_.scope.get(),
config_.memory_optimization_level,
input.dims());
}
} else {
DLOG << "SetInput ---- > resize2";
target_tensor->Resize(input.dims());
input_tensor->Resize(input.dims());
DLOG << "SetInput ---- > ShareDataWith";
}
target_tensor->ShareDataWith(input);
input_tensor->ShareDataWith(input);
if (feed_indices_.size() == 1) {
input_dim_has_changed_ = input_dim_last_ != input.dims();
}
......@@ -1063,7 +1067,5 @@ template class Executor<FPGA, float>;
template class Executor<GPU_CL, float>;
template class Executor<GPU_MALI, float>;
} // namespace framework
} // namespace paddle_mobile
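A note on the dims handed to the pass above: `SetInput` forwards only the raw `input.dims()`, and `ShareData` (later in this commit) keeps each variable's own batch and channel sizes while adopting the input's new height and width. A hypothetical NCHW trace with made-up shapes:

  // Hypothetical trace of the dim rewrite performed in ShareData().
  framework::DDim temp_dim = framework::make_ddim({1, 56, 270, 480});    // a variable
  framework::DDim target_dims = framework::make_ddim({1, 3, 540, 960});  // new input
  framework::DDim need_dims = framework::make_ddim(
      {temp_dim[0], temp_dim[1], target_dims[2], target_dims[3]});
  // need_dims == {1, 56, 540, 960}: N and C kept, H and W adopted.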
......@@ -27,6 +27,7 @@ limitations under the License. */
#include "framework/program/program.h"
#include "framework/tensor.h"
#include "framework/type_trait.h"
#include "pass/memory_optimize.h"
namespace paddle_mobile {
namespace framework {
......
......@@ -284,8 +284,6 @@ template class Loader<CPU, float>;
template class Loader<FPGA, float>;
template class Loader<GPU_MALI, float>;
template class Loader<GPU_CL, float>;
} // namespace framework
......
......@@ -148,7 +148,6 @@ void OperatorBase<Dtype>::InsertTensors() {
template class OperatorBase<CPU>;
template class OperatorBase<FPGA>;
template class OperatorBase<GPU_MALI>;
template class OperatorBase<GPU_CL>;
} // namespace framework
......
......@@ -242,8 +242,6 @@ CreatePaddlePredictor<PaddleMobileConfig, PaddleEngineKind::kPaddleMobile>(
x.reset(new PaddleMobilePredictor<CPU, float>(config));
} else if (config.device == PaddleMobileConfig::kFPGA) {
x.reset(new PaddleMobilePredictor<FPGA, float>(config));
} else if (config.device == PaddleMobileConfig::kGPU_MALI) {
x.reset(new PaddleMobilePredictor<GPU_MALI, float>(config));
} else if (config.device == PaddleMobileConfig::kGPU_CL) {
x.reset(new PaddleMobilePredictor<GPU_CL, float>(config));
} else {
......
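With the `kGPU_MALI` branch gone, OpenCL callers select `kGPU_CL` explicitly. A minimal usage sketch (all config fields other than `device` omitted):

  PaddleMobileConfig config;
  config.device = PaddleMobileConfig::kGPU_CL;  // kGPU_MALI is no longer handled
  auto predictor = CreatePaddlePredictor<PaddleMobileConfig,
                                         PaddleEngineKind::kPaddleMobile>(config);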
......@@ -525,7 +525,6 @@ int PaddleMobile<Device, T>::readText(
template class PaddleMobile<CPU, float>;
template class PaddleMobile<FPGA, float>;
template class PaddleMobile<GPU_MALI, float>;
template class PaddleMobile<GPU_CL, float>;
} // namespace paddle_mobile
......@@ -30,7 +30,6 @@ double PaddleTester<Device, T>::CaculatePredictTime(std::string *cl_path) {
}
template class PaddleTester<CPU, float>;
template class PaddleTester<FPGA, float>;
template class PaddleTester<GPU_MALI, float>;
template class PaddleTester<GPU_CL, float>;
......
......@@ -41,37 +41,31 @@ Print &operator<<(Print &printer, const ConvParam<CPU> &conv_param) {
template class ConvParam<CPU>;
template class ConvParam<FPGA>;
template class ConvParam<GPU_MALI>;
#endif
#ifdef ELEMENTWISEADD_OP
template class ElementwiseAddParam<CPU>;
template class ElementwiseAddParam<FPGA>;
template class ElementwiseAddParam<GPU_MALI>;
#endif
#ifdef ELEMENTWISEMUL_OP
template class ElementwiseMulParam<CPU>;
template class ElementwiseMulParam<FPGA>;
template class ElementwiseMulParam<GPU_MALI>;
#endif
#ifdef MUL_OP
template class MulParam<CPU>;
template class MulParam<FPGA>;
template class MulParam<GPU_MALI>;
#endif
#ifdef CONCAT_OP
template class ConcatParam<CPU>;
template class ConcatParam<FPGA>;
template class ConcatParam<GPU_MALI>;
#endif
#ifdef LRN_OP
template class LrnParam<CPU>;
template class LrnParam<FPGA>;
template class LrnParam<GPU_MALI>;
#endif
#ifdef FUSION_CONVADD_OP
......
......@@ -84,7 +84,7 @@ void SliceOp<Dtype, T>::InferShape() const {
}
}
output->Resize(out_dims);
#ifdef PADDLE_MOBILE_CPU
#if !defined(PADDLE_MOBILE_CL) && defined(PADDLE_MOBILE_CPU)
if (axes[0] != 0) {
output->set_lod(input->lod());
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_CL
#include "pass/memory_optimize_super.h"
#include <algorithm>
#include "framework/cl/cl_image.h"
#include "framework/lod_tensor.h"
namespace paddle_mobile {
namespace pass {
void MemoryOptPassSuper::AppendBlockVars(const framework::BlockDesc *block) {
// block_vars_.clear();
for (const auto var : block->Vars()) {
block_vars_[var->Name()] = var.get();
}
}
bool MemoryOptPassSuper::IsPersistable(const std::string name) {
const auto it = block_vars_.find(name);
if (it != block_vars_.end()) {
return it->second->Persistable();
}
return false;
}
ClVarNode *MemoryOptPassSuper::CreateNode(const std::string name) {
auto it = created_nodes_.find(name);
if (it != created_nodes_.end()) {
++(it->second->count);
return it->second;
}
ClVarNode *var = new ClVarNode;
var->name = name;
var->count = 1;
var->visited = false;
created_nodes_[name] = var;
return var;
}
void MemoryOptPassSuper::operator()(
const framework::ProgramDesc *program, framework::Scope *scope,
MemoryOptimizationLevel memory_optimization_level,
framework::DDim target_dims) {
const auto &blocks = program->Blocks();
for (const auto &block : blocks) {
// access all variables in each block
AppendBlockVars(block.get());
reused_nodes_.clear();
// collect all non-persistable variables and accumulate
// their reference counts
std::stack<ClVarNode *> empty_var_nodes;
analysis_nodes_.swap(empty_var_nodes);
std::vector<std::string> exclude_var_names;
for (const auto &op : block->Ops()) {
for (const auto &inputs : op->GetInputs()) {
for (const auto &input : inputs.second) {
if (!IsPersistable(input)) {
if (memory_optimization_level == MemoryOptimizationWithoutFeeds) {
if (op->Type() == "feed") {
exclude_var_names.push_back(input);
}
}
}
}
}
}
std::vector<ClVarNode *> fetch_var_nodes;
for (const auto &op : block->Ops()) {
DLOG << "op_desc->Type(): " << op->Type();
for (const auto &outputs : op->GetOutputs()) {
for (const auto &output : outputs.second) {
if (!IsPersistable(output) &&
std::find(exclude_var_names.begin(), exclude_var_names.end(),
output) == exclude_var_names.end()) {
DLOG << "output: " << output;
ClVarNode *node = CreateNode(output);
analysis_nodes_.push(node);
}
}
}
for (const auto &inputs : op->GetInputs()) {
for (const auto &input : inputs.second) {
if (!IsPersistable(input) &&
std::find(exclude_var_names.begin(), exclude_var_names.end(),
input) == exclude_var_names.end()) {
DLOG << "input: " << input;
ClVarNode *node = CreateNode(input);
analysis_nodes_.push(node);
if (op->Type() == "fetch") {
fetch_var_nodes.push_back(node);
}
}
}
}
for (const auto &outputs : op->GetOutputs()) {
for (const auto &output : outputs.second) {
if (!IsPersistable(output) &&
std::find(exclude_var_names.begin(), exclude_var_names.end(),
output) == exclude_var_names.end()) {
DLOG << "output: " << output;
ClVarNode *node = CreateNode(output);
analysis_nodes_.push(node);
}
}
}
}
// apply optimize
while (!analysis_nodes_.empty()) {
auto *node = analysis_nodes_.top();
analysis_nodes_.pop();
// only a node that has not been visited yet may reuse memory from
// nodes whose count has dropped to 0, i.e. ones no longer referenced
if (!node->visited) {
bool reused = false;
// find a possible reuse list
for (auto &list : reused_nodes_) {
if (list.back()->count == 0 &&
std::find(fetch_var_nodes.begin(), fetch_var_nodes.end(),
list.back()) == fetch_var_nodes.end()) {
list.push_back(node);
reused = true;
break;
}
}
// create a new list if no reusable one was found
if (!reused) {
std::vector<ClVarNode *> list;
list.push_back(node);
reused_nodes_.push_back(std::move(list));
}
}
node->visited = true;
node->count -= 1;
}
// share data among all variables in the same reuse list
ShareData(scope, memory_optimization_level, target_dims);
}
}
void MemoryOptPassSuper::ShareData(
framework::Scope *scope, MemoryOptimizationLevel memory_optimization_level,
framework::DDim target_dims)
const {  // share data among all variables in the same reuse list
for (const auto &list : reused_nodes_) {
DLOG << "\n";
DLOG << "gpu . share memory within these variables";
// find max dims
int64_t max_numl = -1;
framework::CLImage *reuse_tensor = nullptr;
DLOG << "resused nodes group ----------";
for (const auto &node : list) {
auto *var = scope->Var(node->name);
auto *tensor = var->template GetMutable<framework::CLImage>();
const int64_t numl = tensor->numel();
if (max_numl < numl) {
max_numl = numl;
reuse_tensor = tensor;
}
DLOG << node->name << " ----dims: " << tensor->dims()
<< "----numl----: " << numl;
}
if (reuse_tensor == nullptr) {
return;
}
const framework::DDim &dims = reuse_tensor->dims();
cl_context context = scope->GetCLScpoe()->Context();
cl_command_queue command_queue = scope->GetCLScpoe()->CommandQueue();
framework::DDim reshaped_dim = framework::make_ddim(
{dims[0], dims[1], target_dims[2], target_dims[3]});
DLOG << "target dims : " << target_dims;
DLOG << "reshaped_dim : " << reshaped_dim;
reuse_tensor->InitFakeSizeImage(context, command_queue, reshaped_dim,
reshaped_dim);
for (const auto &node : list) {
auto *var = scope->Var(node->name);
auto *tensor = var->template GetMutable<framework::CLImage>();
const framework::DDim &temp_dim = tensor->dims();
framework::DDim need_dims = framework::make_ddim(
{temp_dim[0], temp_dim[1], target_dims[2], target_dims[3]});
tensor->InitWithExitedMem(context, command_queue, need_dims,
*reuse_tensor);
}
}
}
} // namespace pass
} // namespace paddle_mobile
#endif
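To make the grouping concrete, below is a self-contained toy (not part of the commit) that mirrors the pass's reference counting and reuse-list logic on a hypothetical four-op chain a -> b -> c -> d where d is fetched; scope handling, CLImage, and the feed-exclusion list are stripped out. Because the lifetimes of a and c never overlap, they land in the same group:

  #include <iostream>
  #include <map>
  #include <stack>
  #include <string>
  #include <utility>
  #include <vector>

  struct Node { std::string name; int count = 0; bool visited = false; };

  int main() {
    // Toy program: ops as (inputs, outputs); the last op fetches d.
    std::vector<std::pair<std::vector<std::string>, std::vector<std::string>>>
        ops = {{{}, {"a"}}, {{"a"}, {"b"}}, {{"b"}, {"c"}},
               {{"c"}, {"d"}}, {{"d"}, {}}};
    std::vector<std::string> fetch_vars = {"d"};

    // Push outputs, inputs, then outputs again per op, bumping each node's
    // reference count — the same order MemoryOptPassSuper fills analysis_nodes_.
    std::map<std::string, Node> nodes;
    std::stack<Node*> stk;
    auto touch = [&](const std::string& n) {
      Node& node = nodes[n];
      node.name = n;
      ++node.count;
      stk.push(&node);
    };
    for (const auto& op : ops) {
      for (const auto& o : op.second) touch(o);
      for (const auto& i : op.first) touch(i);
      for (const auto& o : op.second) touch(o);
    }

    // Pop in reverse program order; an unvisited node joins the first group
    // whose last member is dead (count == 0) and is not a fetch output.
    std::vector<std::vector<Node*>> groups;
    auto is_fetch = [&](Node* n) {
      for (const auto& f : fetch_vars)
        if (n->name == f) return true;
      return false;
    };
    while (!stk.empty()) {
      Node* node = stk.top();
      stk.pop();
      if (!node->visited) {
        bool reused = false;
        for (auto& g : groups) {
          if (g.back()->count == 0 && !is_fetch(g.back())) {
            g.push_back(node);
            reused = true;
            break;
          }
        }
        if (!reused) groups.push_back({node});
      }
      node->visited = true;
      --node->count;
    }

    for (const auto& g : groups) {  // prints: {d}, {c a}, {b}
      std::cout << "group:";
      for (Node* n : g) std::cout << ' ' << n->name;
      std::cout << '\n';
    }
    return 0;
  }

Popping happens in reverse program order, so c is assigned first; by the time a is popped, every reference to c has been consumed (count == 0), so a joins c's group. d never shares because fetch outputs must keep their own buffer — the same guard the pass applies via fetch_var_nodes.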
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_CL
#pragma once
#include <stack>
#include <string>
#include <unordered_map>
#include <vector>
#include "framework/lod_tensor.h"
#include "framework/program/program.h"
#include "pass/pass_base.h"
// used for super resolution; intended to be extended to all OpenCL models
namespace paddle_mobile {
namespace pass {
typedef struct {
std::string name; // variable name
int count; // reference count
bool visited;
} ClVarNode;
// MemoryOptPassSuper analyzes the program and reuses memory between
// variables as much as possible
class MemoryOptPassSuper : public PassBase {
public:
MemoryOptPassSuper() {}
virtual ~MemoryOptPassSuper() {
for (auto &it : created_nodes_) {
delete it.second;
}
}
void operator()(const framework::ProgramDesc *program,
framework::Scope *scope,
MemoryOptimizationLevel memory_optimization_level,
framework::DDim dims);
void AppendBlockVars(const framework::BlockDesc *block);
bool IsPersistable(const std::string name);
ClVarNode *CreateNode(const std::string name);
void ShareData(framework::Scope *scope,
MemoryOptimizationLevel memory_optimization_level,
framework::DDim dims) const;
private:
std::stack<ClVarNode *> analysis_nodes_;
std::vector<std::vector<ClVarNode *>> reused_nodes_;
std::unordered_map<std::string, ClVarNode *> created_nodes_;
std::unordered_map<std::string, framework::VarDesc *> block_vars_;
};
} // namespace pass
} // namespace paddle_mobile
#endif
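For completeness, invoking the pass matches the call added in `Executor<GPU_CL, float>::SetInput` above — a hedged sketch, assuming `program_desc`, `scope`, and `config` are the executor's own members and `input_dims` is the new feed shape:

  // Sketch; the names are placeholders for the executor's state.
  pass::MemoryOptPassSuper()(program_desc.get(), scope.get(),
                             config.memory_optimization_level, input_dims);
  // The pass object is a temporary: its destructor frees every ClVarNode
  // accumulated in created_nodes_.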