diff --git a/src/common/types.h b/src/common/types.h
index ee2ff998ecd3dd9af5ec78b35b595f5552f9c042..744855e0fd52f64f478a43794ad92e0e50408f26 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -53,7 +53,6 @@ struct DeviceType {};
 
 typedef DeviceType CPU;
 typedef DeviceType FPGA;
-typedef DeviceType GPU_MALI;
 typedef DeviceType GPU_CL;
 
 //! data type
diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h
index 5975d60edfa86d1dd396d26c2888600906955db8..a375f611a8fb0d1f5cda7a28fa557312fee3fb26 100644
--- a/src/framework/cl/cl_image.h
+++ b/src/framework/cl/cl_image.h
@@ -145,21 +145,99 @@ class CLImage {
     initialized_ = true;
     DLOG << " end init cl image";
   }
+
+  /**
+   * Initializes this image as the backing store that other images attach to
+   * through InitWithExitedMem(): InitCLImage() is called with the image dims
+   * derived from `need_dims` and a null data pointer.
+   *
+   * NOTE(review): image_converter_ is overwritten without releasing its
+   * previous value - confirm re-initialization does not leak a converter.
+   *
+   * @param context        context used for the image and its event
+   * @param command_queue  queue stored in command_queue_
+   * @param need_dims      tensor dims the image is sized for
+   * @param real_dims      tensor dims recorded in real_tensor_dims
+   */
+  void InitFakeSizeImage(cl_context context, cl_command_queue command_queue,
+                         const DDim &need_dims, const DDim &real_dims) {
+    PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
+                          " empty image tensor data shouldn't have value");
 
-  void InitEmptyWithImageDim(cl_context context, cl_command_queue command_queue,
-                             const DDim &image_dims) {
-    DLOG << " to get image dims ";
-    image_dims_ = image_dims;
-    DLOG << " end get image dims " << image_dims_;
+    CLImageConverterNormal *normal_converter = new CLImageConverterNormal();
+
+    real_image_dims = normal_converter->InitImageDimInfoWith(real_dims);
+    real_tensor_dims = real_dims;
 
+    image_dims_ = normal_converter->InitImageDimInfoWith(need_dims);
     InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
 
+    tensor_dims_ = need_dims;
+    command_queue_ = command_queue;
+    image_converter_ = normal_converter;
+    cl_event_ = CLEngine::Instance()->CreateEvent(context);
+    initialized_ = true;
+    DLOG << " end init cl image";
+  }
+
+  /**
+   * Initializes this image on top of the memory of `src` instead of
+   * allocating its own: the dims are recomputed for `need_dims`, the image
+   * handle is shared through ShareHolderWith().
+   *
+   * NOTE(review): image_converter_ is overwritten without releasing its
+   * previous value - confirm re-initialization does not leak a converter.
+   *
+   * @param context        context used for the event
+   * @param command_queue  queue stored in command_queue_
+   * @param need_dims      tensor dims this image reports afterwards
+   * @param src            image whose memory is shared; must hold an image
+   */
+  void InitWithExitedMem(cl_context context, cl_command_queue command_queue,
+                         DDim need_dims, CLImage &src) {
+    CLImageConverterNormal *normal_converter = new CLImageConverterNormal();
+
+    real_image_dims = normal_converter->InitImageDimInfoWith(src.dims());
+    real_tensor_dims = src.dims();
+
+    image_dims_ = normal_converter->InitImageDimInfoWith(need_dims);
+    ShareHolderWith(src);
+
+    tensor_dims_ = need_dims;
     command_queue_ = command_queue;
+    image_converter_ = normal_converter;
     cl_event_ = CLEngine::Instance()->CreateEvent(context);
     initialized_ = true;
     DLOG << " end init cl image";
   }
 
+  /**
+   * Makes this image refer to the same cl_mem as `src`.
+   *
+   * cl_image_ is an owning handle, so the cl_mem is retained before it is
+   * adopted; adopting the raw handle without a retain would let both images
+   * release the same reference.
+   * NOTE(review): assumes the deleter of cl_image_ calls clReleaseMemObject
+   * exactly once - confirm against its declaration.
+   */
+  inline CLImage &ShareHolderWith(const CLImage &src) {
+    PADDLE_MOBILE_ENFORCE(
+        src.cl_image_ != nullptr,
+        "Tensor holds no memory. Call Tensor::mutable_data first.");
+
+    if (cl_image_ != src.cl_image_) {
+      cl_mem shared_mem = src.cl_image_.get();
+      const cl_int status = clRetainMemObject(shared_mem);
+      PADDLE_MOBILE_ENFORCE(status == CL_SUCCESS,
+                            " failed to retain the shared cl image");
+      // adopt the handle only when this image got its own reference
+      if (status == CL_SUCCESS) {
+        cl_image_.reset(shared_mem);
+      }
+    }
+    return *this;
+  }
+
   cl_mem GetCLImage() const { return cl_image_.get(); }
 
   const DDim &ImageDims() const { return image_dims_; }
@@ -238,6 +316,10 @@ class CLImage {
   std::unique_ptr<_cl_event, CLEventDeleter> cl_event_;
   DDim tensor_dims_;
   DDim image_dims_;
+  // image dims of the real tensor; usually the same as image_dims_
+  DDim real_image_dims;
+  // dims of the real tensor; usually the same as tensor_dims_
+  DDim real_tensor_dims;
   float *tensor_data_ = nullptr;
   cl_context context_;
   cl_command_queue command_queue_;
diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index 210360f4ca75c7ff116e2fb9bc0a553383486e23..d817b0c1e9a77dee6bde6c47cf9b5f614cad13bc 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -33,6 +33,7 @@ limitations under the License.
*/ #include "pass/model_obfuscate.h" #ifdef PADDLE_MOBILE_CL #include "framework/cl/cl_image.h" +#include "pass/memory_optimize_super.h" #endif namespace paddle_mobile { @@ -55,7 +56,7 @@ Executor::Executor(const Program &program, use_optimize_(use_optimize), lod_mode_(lod_mode), config_(config) { - DLOG << "executor in lod mode: " << lod_mode_; + DLOG << "executor in lod mode: " << lod_mode; Variable *variable_ptr = program_.scope->Var("batch_size"); variable_ptr->SetValue(batch_size); @@ -805,27 +806,30 @@ void Executor::SetInput(const Tensor &input, index = feed_indices_.find(var_name)->second; } auto *feed_var = program_.scope->Var("feed"); - framework::LoDTensor *target_tensor = + framework::LoDTensor *input_tensor = &(feed_var->template GetMutable()->at(index)); DLOG << "config_.load_when_predict " << config_.load_when_predict; - DLOG << "target_tensor->IsInitialized() " << target_tensor->IsInitialized(); - DLOG << "target_tensor->dims() " << target_tensor->dims(); + DLOG << "target_tensor->IsInitialized() " << input_tensor->IsInitialized(); + DLOG << "target_tensor->dims() " << input_tensor->dims(); DLOG << "input.dims() " << input.dims(); DLOG << "input_dim_last_ " << input_dim_last_; if (config_.load_when_predict) { if (input_dim_last_ != input.dims()) { DLOG << "SetInput ---- > resize1"; - target_tensor->Resize(input.dims()); - target_tensor->mutable_data(); - InitNoPersistableMemory(*target_tensor); + input_tensor->Resize(input.dims()); + input_tensor->mutable_data(); + // InitNoPersistableMemory(*input_tensor); + pass::MemoryOptPassSuper()(program_desc_.get(), program_.scope.get(), + config_.memory_optimization_level, + input.dims()); } } else { DLOG << "SetInput ---- > resize2"; - target_tensor->Resize(input.dims()); + input_tensor->Resize(input.dims()); DLOG << "SetInput ---- > ShareDataWith"; } - target_tensor->ShareDataWith(input); + input_tensor->ShareDataWith(input); if (feed_indices_.size() == 1) { input_dim_has_changed_ = input_dim_last_ != 
input.dims(); } @@ -1063,7 +1067,5 @@ template class Executor; template class Executor; -template class Executor; - } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/executor.h b/src/framework/executor.h index 81b37734d673304c2a303ca9024aea9fb5c543d5..d7dcb17620175b1c7ae8d3334a5de11ddc8b760a 100644 --- a/src/framework/executor.h +++ b/src/framework/executor.h @@ -27,6 +27,7 @@ limitations under the License. */ #include "framework/program/program.h" #include "framework/tensor.h" #include "framework/type_trait.h" +#include "pass/memory_optimize.h" namespace paddle_mobile { namespace framework { diff --git a/src/framework/loader.cpp b/src/framework/loader.cpp index 99674307aae2b105ca1e125dbbb959f0f5301c6d..4350fda969a01f7d672f5aebdc9a77390e175b9b 100644 --- a/src/framework/loader.cpp +++ b/src/framework/loader.cpp @@ -284,8 +284,6 @@ template class Loader; template class Loader; -template class Loader; - template class Loader; } // namespace framework diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index c1d7fe351fcb4121546f07a3f26f10d784c1baa8..402512c7237be0ca26470361cc16369bd97f7758 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -148,7 +148,6 @@ void OperatorBase::InsertTensors() { template class OperatorBase; template class OperatorBase; -template class OperatorBase; template class OperatorBase; } // namespace framework diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc index 3bf970294d8db2cd64e351163d88ac89fb6343d0..eb8a537ca473148237fd74adeb2f1fc84cb8e2ef 100644 --- a/src/io/api_paddle_mobile.cc +++ b/src/io/api_paddle_mobile.cc @@ -242,8 +242,6 @@ CreatePaddlePredictor( x.reset(new PaddleMobilePredictor(config)); } else if (config.device == PaddleMobileConfig::kFPGA) { x.reset(new PaddleMobilePredictor(config)); - } else if (config.device == PaddleMobileConfig::kGPU_MALI) { - x.reset(new PaddleMobilePredictor(config)); } else if (config.device == 
PaddleMobileConfig::kGPU_CL) { x.reset(new PaddleMobilePredictor(config)); } else { diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp index b4e24dfd6185fbf092254d2860f7aa130412964f..c08d80132729eb362dc6cff600d0e44c95f93e91 100644 --- a/src/io/paddle_mobile.cpp +++ b/src/io/paddle_mobile.cpp @@ -525,7 +525,6 @@ int PaddleMobile::readText( template class PaddleMobile; template class PaddleMobile; -template class PaddleMobile; template class PaddleMobile; } // namespace paddle_mobile diff --git a/src/io/paddle_test_inference_api.cpp b/src/io/paddle_test_inference_api.cpp index 33a902f390e06380096e49c58fe4e30181129dc9..d0c6c48c2006cd710eadb6160f49702d98c2e031 100644 --- a/src/io/paddle_test_inference_api.cpp +++ b/src/io/paddle_test_inference_api.cpp @@ -30,7 +30,6 @@ double PaddleTester::CaculatePredictTime(std::string *cl_path) { } template class PaddleTester; template class PaddleTester; -template class PaddleTester; template class PaddleTester; diff --git a/src/operators/op_param.cpp b/src/operators/op_param.cpp index 4e008715bd3111f27c6f00847ac91024c663100d..bccff4a27425e75066cdf34301f051d24e47ae25 100644 --- a/src/operators/op_param.cpp +++ b/src/operators/op_param.cpp @@ -41,37 +41,31 @@ Print &operator<<(Print &printer, const ConvParam &conv_param) { template class ConvParam; template class ConvParam; -template class ConvParam; #endif #ifdef ELEMENTWISEADD_OP template class ElementwiseAddParam; template class ElementwiseAddParam; -template class ElementwiseAddParam; #endif #ifdef ELEMENTWISEMUL_OP template class ElementwiseMulParam; template class ElementwiseMulParam; -template class ElementwiseMulParam; #endif #ifdef MUL_OP template class MulParam; template class MulParam; -template class MulParam; #endif #ifdef CONCAT_OP template class ConcatParam; template class ConcatParam; -template class ConcatParam; #endif #ifdef LRN_OP template class LrnParam; template class LrnParam; -template class LrnParam; #endif #ifdef FUSION_CONVADD_OP diff --git 
a/src/operators/slice_op.cpp b/src/operators/slice_op.cpp
index 58e6a47ddc8784224506557de8322a670c9dc5c2..6351d2d02890683a6ceee9f149a9a524edc01a34 100644
--- a/src/operators/slice_op.cpp
+++ b/src/operators/slice_op.cpp
@@ -84,7 +84,7 @@ void SliceOp::InferShape() const {
     }
   }
   output->Resize(out_dims);
-#ifdef PADDLE_MOBILE_CPU
+#if !defined(PADDLE_MOBILE_CL) && defined(PADDLE_MOBILE_CPU)
   if (axes[0] != 0) {
     output->set_lod(input->lod());
   }
diff --git a/src/pass/memory_optimize_super.cpp b/src/pass/memory_optimize_super.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..344b88b02ed915570f50a4f0eebdc9949c338ddb
--- /dev/null
+++ b/src/pass/memory_optimize_super.cpp
@@ -0,0 +1,219 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_MOBILE_CL
+#include "pass/memory_optimize_super.h"
+#include <algorithm>
+#include "framework/cl/cl_image.h"
+#include "framework/lod_tensor.h"
+namespace paddle_mobile {
+namespace pass {
+
+// Registers every variable description of `block` under its name so that
+// IsPersistable() can look it up. Entries of earlier blocks are kept.
+void MemoryOptPassSuper::AppendBlockVars(const framework::BlockDesc *block) {
+  for (const auto &var : block->Vars()) {
+    block_vars_[var->Name()] = var.get();
+  }
+}
+
+// Returns the Persistable() flag of the registered variable `name`, or false
+// when no variable of that name has been registered.
+bool MemoryOptPassSuper::IsPersistable(const std::string &name) {
+  const auto it = block_vars_.find(name);
+  return it != block_vars_.end() && it->second->Persistable();
+}
+
+// Returns the node that tracks `name`, creating it with a count of 1 on first
+// use and incrementing the count on every later call.
+ClVarNode *MemoryOptPassSuper::CreateNode(const std::string &name) {
+  std::unique_ptr<ClVarNode> &slot = created_nodes_[name];
+  if (slot) {
+    ++slot->count;
+  } else {
+    slot.reset(new ClVarNode);
+    slot->name = name;
+    slot->count = 1;
+    slot->visited = false;
+  }
+  return slot.get();
+}
+
+// Runs the pass: for each block of `program`, groups the non-persistable
+// variables into reuse lists and then lets ShareData() attach every list to a
+// single image whose last two dims come from `target_dims`.
+void MemoryOptPassSuper::operator()(
+    const framework::ProgramDesc *program, framework::Scope *scope,
+    MemoryOptimizationLevel memory_optimization_level,
+    framework::DDim target_dims) {
+  const auto &blocks = program->Blocks();
+  for (const auto &block : blocks) {
+    // access all variables in each block
+    AppendBlockVars(block.get());
+    reused_nodes_.clear();
+    // start every block with an empty analysis stack
+    std::stack<ClVarNode *> empty_var_nodes;
+    analysis_nodes_.swap(empty_var_nodes);
+
+    // With MemoryOptimizationWithoutFeeds the non-persistable inputs of
+    // "feed" ops are kept out of the analysis.
+    std::vector<std::string> exclude_var_names;
+    if (memory_optimization_level == MemoryOptimizationWithoutFeeds) {
+      for (const auto &op : block->Ops()) {
+        if (op->Type() != "feed") {
+          continue;
+        }
+        for (const auto &inputs : op->GetInputs()) {
+          for (const auto &input : inputs.second) {
+            if (!IsPersistable(input)) {
+              exclude_var_names.push_back(input);
+            }
+          }
+        }
+      }
+    }
+
+    // Pushes one use of `var_name` onto the analysis stack and returns its
+    // node; persistable and excluded variables yield nullptr instead.
+    const auto track = [&](const char *label,
+                           const std::string &var_name) -> ClVarNode * {
+      if (IsPersistable(var_name) ||
+          std::find(exclude_var_names.begin(), exclude_var_names.end(),
+                    var_name) != exclude_var_names.end()) {
+        return nullptr;
+      }
+      DLOG << label << var_name;
+      ClVarNode *node = CreateNode(var_name);
+      analysis_nodes_.push(node);
+      return node;
+    };
+
+    std::vector<ClVarNode *> fetch_var_nodes;
+    for (const auto &op : block->Ops()) {
+      DLOG << "op_desc->Type(): " << op->Type();
+      // outputs are pushed twice per op: once before and once after the inputs
+      for (const auto &outputs : op->GetOutputs()) {
+        for (const auto &output : outputs.second) {
+          track("output: ", output);
+        }
+      }
+      for (const auto &inputs : op->GetInputs()) {
+        for (const auto &input : inputs.second) {
+          ClVarNode *node = track("input: ", input);
+          if (node != nullptr && op->Type() == "fetch") {
+            fetch_var_nodes.push_back(node);
+          }
+        }
+      }
+      for (const auto &outputs : op->GetOutputs()) {
+        for (const auto &output : outputs.second) {
+          track("output: ", output);
+        }
+      }
+    }
+
+    // apply optimize
+    while (!analysis_nodes_.empty()) {
+      ClVarNode *node = analysis_nodes_.top();
+      analysis_nodes_.pop();
+      // only not visited node can reuse memory between other nodes
+      // with 0 count which indicate they will not be used any more
+      if (!node->visited) {
+        // find a reuse list whose last node is finished and is not fetched
+        const auto reusable = std::find_if(
+            reused_nodes_.begin(), reused_nodes_.end(),
+            [&](const std::vector<ClVarNode *> &list) {
+              return list.back()->count == 0 &&
+                     std::find(fetch_var_nodes.begin(), fetch_var_nodes.end(),
+                               list.back()) == fetch_var_nodes.end();
+            });
+        if (reusable != reused_nodes_.end()) {
+          reusable->push_back(node);
+        } else {
+          // create new list if can't find a reused list
+          reused_nodes_.push_back(std::vector<ClVarNode *>(1, node));
+        }
+      }
+      node->visited = true;
+      node->count -= 1;
+    }
+
+    // shared data within all variables in the same reused list
+    ShareData(scope, memory_optimization_level, target_dims);
+  }
+}
+
+// Makes every variable of each reuse list use the image of the list's
+// largest tensor (by numel()), re-initialised with `target_dims[2]` and
+// `target_dims[3]` as its last two dims.
+//
+// NOTE(review): dims[0..1] and target_dims[2..3] are indexed without a rank
+// check, so this presumably expects 4-D tensors - confirm against callers.
+// NOTE(review): the largest tensor is picked from the current dims while the
+// image is sized from the retargeted dims; verify that no other member of
+// the list needs a bigger image after retargeting.
+void MemoryOptPassSuper::ShareData(
+    framework::Scope *scope, MemoryOptimizationLevel memory_optimization_level,
+    framework::DDim target_dims) const {
+  for (const auto &list : reused_nodes_) {
+    DLOG << "\n";
+    DLOG << "gpu . share memory within these variables";
+    // find max dims
+    int64_t max_numl = -1;
+
+    framework::CLImage *reuse_tensor = nullptr;
+    DLOG << "resused nodes group ----------";
+    for (const auto &node : list) {
+      auto *var = scope->Var(node->name);
+      auto *tensor = var->template GetMutable<framework::CLImage>();
+      const int64_t numl = tensor->numel();
+      if (max_numl < numl) {
+        max_numl = numl;
+        reuse_tensor = tensor;
+      }
+      DLOG << node->name << " ----dims: " << tensor->dims()
+           << "----numl----: " << numl;
+    }
+
+    if (reuse_tensor == nullptr) {
+      // nothing to share in this list; the remaining lists still need work
+      continue;
+    }
+
+    const framework::DDim &dims = reuse_tensor->dims();
+    cl_context context = scope->GetCLScpoe()->Context();
+    cl_command_queue command_queue = scope->GetCLScpoe()->CommandQueue();
+
+    framework::DDim reshaped_dim = framework::make_ddim(
+        {dims[0], dims[1], target_dims[2], target_dims[3]});
+
+    DLOG << "target dims : " << target_dims;
+    DLOG << "reshaped_dim : " << reshaped_dim;
+    reuse_tensor->InitFakeSizeImage(context, command_queue, reshaped_dim,
+                                    reshaped_dim);
+
+    for (const auto &node : list) {
+      auto *var = scope->Var(node->name);
+      auto *tensor = var->template GetMutable<framework::CLImage>();
+      const framework::DDim &temp_dim = tensor->dims();
+      framework::DDim need_dims = framework::make_ddim(
+          {temp_dim[0], temp_dim[1], target_dims[2], target_dims[3]});
+      tensor->InitWithExitedMem(context, command_queue, need_dims,
+                                *reuse_tensor);
+    }
+  }
+}
+
+}  // namespace pass
+}  // namespace paddle_mobile
+#endif
diff --git a/src/pass/memory_optimize_super.h b/src/pass/memory_optimize_super.h
new file mode 100644
index 0000000000000000000000000000000000000000..08af29919f99253765412a2dae81fc95d9f5e62c
--- /dev/null
+++ b/src/pass/memory_optimize_super.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#ifdef PADDLE_MOBILE_CL
+
+#include <memory>
+#include <stack>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "framework/lod_tensor.h"
+#include "framework/program/program.h"
+#include "pass/pass_base.h"
+
+// Written for super-resolution models; meant to be extended to all OpenCL
+// models later.
+namespace paddle_mobile {
+namespace pass {
+
+// Bookkeeping record of one variable seen by MemoryOptPassSuper.
+struct ClVarNode {
+  std::string name;      // variable name
+  int count = 0;         // reference count
+  bool visited = false;  // set once the node was popped during analysis
+};
+
+// MemoryOptPassSuper analyzes the program and reuses one OpenCL image
+// between variables as much as possible.
+//
+// The pass owns its ClVarNode objects through unique_ptr, so the pass
+// cannot be copied.
+class MemoryOptPassSuper : public PassBase {
+ public:
+  MemoryOptPassSuper() {}
+  virtual ~MemoryOptPassSuper() {}
+
+  // Runs the pass on every block of `program`; `dims` supplies the last two
+  // dims of the shared images.
+  void operator()(const framework::ProgramDesc *program,
+                  framework::Scope *scope,
+                  MemoryOptimizationLevel memory_optimization_level,
+                  framework::DDim dims);
+
+  // Registers the variables of `block` for IsPersistable() lookups.
+  void AppendBlockVars(const framework::BlockDesc *block);
+
+  // True when a registered variable called `name` is persistable.
+  bool IsPersistable(const std::string &name);
+
+  // Returns the node for `name`, creating it or bumping its count.
+  ClVarNode *CreateNode(const std::string &name);
+
+  // Attaches every variable of each reuse list to one shared image.
+  void ShareData(framework::Scope *scope,
+                 MemoryOptimizationLevel memory_optimization_level,
+                 framework::DDim dims) const;
+
+ private:
+  std::stack<ClVarNode *> analysis_nodes_;
+  std::vector<std::vector<ClVarNode *>> reused_nodes_;
+  std::unordered_map<std::string, std::unique_ptr<ClVarNode>> created_nodes_;
+  std::unordered_map<std::string, framework::VarDesc *> block_vars_;
+};
+
+}  // namespace pass
+}  // namespace paddle_mobile
+#endif