Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
5506defe
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
332
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
5506defe
编写于
10月 09, 2019
作者:
X
xiebaiyuan
提交者:
GitHub
10月 09, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
paddle mobile runtime cl memory optimise. test=develop (#2160)
上级
ebae97f4
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
154 addition
and
62 deletion
+154
-62
mobile/src/framework/cl/cl_image.h
mobile/src/framework/cl/cl_image.h
+34
-18
mobile/src/framework/executor.cpp
mobile/src/framework/executor.cpp
+16
-5
mobile/src/pass/memory_optimize_cl.cpp
mobile/src/pass/memory_optimize_cl.cpp
+94
-34
mobile/src/pass/memory_optimize_cl.h
mobile/src/pass/memory_optimize_cl.h
+10
-5
未找到文件。
mobile/src/framework/cl/cl_image.h
浏览文件 @
5506defe
...
...
@@ -146,20 +146,26 @@ class CLImage {
initialized_
=
true
;
DLOG
<<
" end init cl image"
;
}
// create fake size cl_mem for mem share
/**
* create fake size cl_mem for mem share
*/
void
InitFakeSizeImage
(
cl_context
context
,
cl_command_queue
command_queue
,
const
DDim
&
need_dims
,
const
DDim
&
real_dims
)
{
const
DDim
&
need_dims
,
const
DDim
&
real_
image_
dims
)
{
PADDLE_MOBILE_ENFORCE
(
tensor_data_
==
nullptr
,
" empty image tensor data shouldn't have value"
);
CLImageConverterNormal
*
normal_converter
=
new
CLImageConverterNormal
();
real_image_dims
=
normal_converter
->
InitImageDimInfoWith
(
real_dims
)
;
real_tensor_dims
=
real_dims
;
// use real image dims to create mem
real_image_dims
_
=
real_image_dims
;
InitCLImage
(
context
,
real_image_dims_
[
0
],
real_image_dims_
[
1
],
nullptr
)
;
// cheat cl_image they got what they wanted
image_dims_
=
normal_converter
->
InitImageDimInfoWith
(
need_dims
);
InitCLImage
(
context
,
image_dims_
[
0
],
image_dims_
[
1
],
nullptr
);
DLOG
<<
"InitFakeSizeImage ... "
;
DLOG
<<
"real_image_dims: "
<<
real_image_dims_
;
DLOG
<<
"image_dims_: "
<<
image_dims_
;
PADDLE_MOBILE_ENFORCE
(
real_image_dims_
[
0
]
>=
image_dims_
[
0
]
&&
real_image_dims_
[
1
]
>=
image_dims_
[
1
],
"real image is not enough"
);
tensor_dims_
=
need_dims
;
command_queue_
=
command_queue
;
image_converter_
=
normal_converter
;
...
...
@@ -167,16 +173,28 @@ class CLImage {
initialized_
=
true
;
DLOG
<<
" end init cl image"
;
}
void
InitWithExitedMem
(
cl_context
context
,
cl_command_queue
command_queue
,
DDim
need_dims
,
const
CLImage
&
src
)
{
/**
* init cl mem with a exist cl mem
*/
void
InitWithExistMem
(
cl_context
context
,
cl_command_queue
command_queue
,
DDim
need_dims
,
CLImage
&
src
)
{
CLImageConverterNormal
*
normal_converter
=
new
CLImageConverterNormal
();
real_image_dims
=
normal_converter
->
InitImageDimInfoWith
(
src
.
dims
());
real_tensor_dims
=
src
.
dims
();
real_image_dims_
=
src
.
real_image_dims_
;
image_dims_
=
normal_converter
->
InitImageDimInfoWith
(
need_dims
);
// InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
DLOG
<<
"InitWithExistMem ... "
;
DLOG
<<
"real_image_dims: "
<<
real_image_dims_
;
DLOG
<<
"image_dims_: "
<<
image_dims_
;
// PADDLE_MOBILE_ENFORCE(real_image_dims[0] >= image_dims_[0] &&
// real_image_dims[1] >= image_dims_[1],
// "real image is not enough!");
if
(
real_image_dims_
[
0
]
<
image_dims_
[
0
]
||
real_image_dims_
[
1
]
<
image_dims_
[
1
])
{
DLOG
<<
"real image is not enough!"
;
DLOG
<<
"real_image_dims: "
<<
real_image_dims_
;
DLOG
<<
"image_dims_: "
<<
image_dims_
;
}
if
(
cl_image_
!=
src
.
cl_image_
)
{
cl_image_
.
reset
(
src
.
cl_image_
.
get
());
}
...
...
@@ -289,9 +307,7 @@ class CLImage {
DDim
tensor_dims_
;
DDim
image_dims_
;
// real image dims usually it is same as image_dims
DDim
real_image_dims
;
// real tensor dims usually it is same as tensor dims
DDim
real_tensor_dims
;
DDim
real_image_dims_
;
float
*
tensor_data_
=
nullptr
;
cl_context
context_
;
cl_command_queue
command_queue_
;
...
...
mobile/src/framework/executor.cpp
浏览文件 @
5506defe
...
...
@@ -33,7 +33,7 @@ limitations under the License. */
#include "pass/model_obfuscate.h"
#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#include "pass/memory_optimize_
super
.h"
#include "pass/memory_optimize_
cl
.h"
#endif
namespace
paddle_mobile
{
...
...
@@ -126,6 +126,14 @@ Executor<Device, T>::Executor(const Program<Device> &program,
printf
(
"================[ op init profile ]==================
\n
"
);
PrintProfile
(
profile
);
#endif
#ifdef PADDLE_MOBILE_CL
if
(
!
config
.
load_when_predict
&&
!
lod_mode
&&
config_
.
memory_optimization_level
!=
NoMemoryOptimization
)
{
pass
::
MemoryOptPassCl
()(
program_desc_
.
get
(),
program_
.
scope
.
get
(),
config_
.
memory_optimization_level
);
}
#endif
}
template
<
typename
Device
,
typename
T
>
...
...
@@ -853,10 +861,13 @@ void Executor<GPU_CL, float>::SetInput(const Tensor &input,
DLOG
<<
"SetInput ---- > resize1"
;
input_tensor
->
Resize
(
input
.
dims
());
input_tensor
->
mutable_data
<
float
>
();
// InitNoPersistableMemory(*input_tensor);
pass
::
MemoryOptPassSuper
()(
program_desc_
.
get
(),
program_
.
scope
.
get
(),
config_
.
memory_optimization_level
,
input
.
dims
());
if
(
config_
.
memory_optimization_level
==
NoMemoryOptimization
)
{
InitNoPersistableMemory
(
*
input_tensor
);
}
else
{
pass
::
MemoryOptPassCl
()(
program_desc_
.
get
(),
program_
.
scope
.
get
(),
config_
.
memory_optimization_level
,
input
.
dims
());
}
}
}
else
{
DLOG
<<
"SetInput ---- > resize2"
;
...
...
mobile/src/pass/memory_optimize_
super
.cpp
→
mobile/src/pass/memory_optimize_
cl
.cpp
浏览文件 @
5506defe
...
...
@@ -12,21 +12,21 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_CL
#include "pass/memory_optimize_
super
.h"
#include "pass/memory_optimize_
cl
.h"
#include <algorithm>
#include "framework/cl/cl_image.h"
#include "framework/lod_tensor.h"
namespace
paddle_mobile
{
namespace
pass
{
void
MemoryOptPass
Super
::
AppendBlockVars
(
const
framework
::
BlockDesc
*
block
)
{
void
MemoryOptPass
Cl
::
AppendBlockVars
(
const
framework
::
BlockDesc
*
block
)
{
// block_vars_.clear();
for
(
const
auto
var
:
block
->
Vars
())
{
block_vars_
[
var
->
Name
()]
=
var
.
get
();
}
}
bool
MemoryOptPass
Super
::
IsPersistable
(
const
std
::
string
name
)
{
bool
MemoryOptPass
Cl
::
IsPersistable
(
const
std
::
string
name
)
{
const
auto
it
=
block_vars_
.
find
(
name
);
if
(
it
!=
block_vars_
.
end
())
{
return
it
->
second
->
Persistable
();
...
...
@@ -34,7 +34,7 @@ bool MemoryOptPassSuper::IsPersistable(const std::string name) {
return
false
;
}
ClVarNode
*
MemoryOptPass
Super
::
CreateNode
(
const
std
::
string
name
)
{
ClVarNode
*
MemoryOptPass
Cl
::
CreateNode
(
const
std
::
string
name
)
{
auto
it
=
created_nodes_
.
find
(
name
);
if
(
it
!=
created_nodes_
.
end
())
{
++
(
it
->
second
->
count
);
...
...
@@ -48,7 +48,7 @@ ClVarNode *MemoryOptPassSuper::CreateNode(const std::string name) {
return
var
;
}
void
MemoryOptPass
Super
::
operator
()(
void
MemoryOptPass
Cl
::
operator
()(
const
framework
::
ProgramDesc
*
program
,
framework
::
Scope
*
scope
,
MemoryOptimizationLevel
memory_optimization_level
,
framework
::
DDim
target_dims
)
{
...
...
@@ -82,6 +82,8 @@ void MemoryOptPassSuper::operator()(
DLOG
<<
"op_desc->Type(): "
<<
op
->
Type
();
for
(
const
auto
&
outputs
:
op
->
GetOutputs
())
{
for
(
const
auto
&
output
:
outputs
.
second
)
{
// not a persistable and not a exclude one ,then add it to
// analysis_nodes
if
(
!
IsPersistable
(
output
)
&&
std
::
find
(
exclude_var_names
.
begin
(),
exclude_var_names
.
end
(),
output
)
==
exclude_var_names
.
end
())
{
...
...
@@ -93,6 +95,8 @@ void MemoryOptPassSuper::operator()(
}
for
(
const
auto
&
inputs
:
op
->
GetInputs
())
{
for
(
const
auto
&
input
:
inputs
.
second
)
{
// not a persistable and not a exclude one ,then add it to
// analysis_nodes
if
(
!
IsPersistable
(
input
)
&&
std
::
find
(
exclude_var_names
.
begin
(),
exclude_var_names
.
end
(),
input
)
==
exclude_var_names
.
end
())
{
...
...
@@ -128,6 +132,7 @@ void MemoryOptPassSuper::operator()(
bool
reused
=
false
;
// find out a possable reuse list
for
(
auto
&
list
:
reused_nodes_
)
{
// reference count = 0 and not in fetch list
if
(
list
.
back
()
->
count
==
0
&&
std
::
find
(
fetch_var_nodes
.
begin
(),
fetch_var_nodes
.
end
(),
list
.
back
())
==
fetch_var_nodes
.
end
())
{
...
...
@@ -146,60 +151,115 @@ void MemoryOptPassSuper::operator()(
node
->
visited
=
true
;
node
->
count
-=
1
;
}
// shared data within all variables in the same reused list
ShareData
(
scope
,
memory_optimization_level
,
target_dims
);
}
}
void
MemoryOptPass
Super
::
ShareData
(
void
MemoryOptPass
Cl
::
ShareData
(
framework
::
Scope
*
scope
,
MemoryOptimizationLevel
memory_optimization_level
,
framework
::
DDim
target_dims
)
const
{
// shared data within all variables in the same reused list
cl_context
context
=
scope
->
GetCLScpoe
()
->
Context
();
cl_command_queue
command_queue
=
scope
->
GetCLScpoe
()
->
CommandQueue
();
for
(
const
auto
&
list
:
reused_nodes_
)
{
DLOG
<<
"
\n
"
;
DLOG
<<
"gpu . share memory within these variables"
;
// find max dims
int64_t
max_numl
=
-
1
;
int64_t
x_based_max_numl
=
-
1
;
int64_t
y_based_max_numl
=
-
1
;
int64_t
x_based_max_x
=
-
1
;
int64_t
x_based_max_y
=
-
1
;
int64_t
y_based_max_x
=
-
1
;
int64_t
y_based_max_y
=
-
1
;
framework
::
CLImage
*
reuse_tensor
=
nullptr
;
DLOG
<<
"resused nodes group ----------"
;
framework
::
CLImage
*
x_based_
reuse_tensor
=
nullptr
;
framework
::
CLImage
*
y_based_reuse_tensor
=
nullptr
;
for
(
const
auto
&
node
:
list
)
{
auto
*
var
=
scope
->
Var
(
node
->
name
);
auto
*
tensor
=
var
->
template
GetMutable
<
framework
::
CLImage
>();
const
int64_t
numl
=
tensor
->
numel
();
if
(
max_numl
<
numl
)
{
max_numl
=
numl
;
reuse_tensor
=
tensor
;
auto
origin_tensor_dims
=
tensor
->
dims
();
PADDLE_MOBILE_ENFORCE
(
origin_tensor_dims
.
size
()
==
4
,
"tensor dims must larger than 4"
);
// for super ,hack origin dims
if
(
target_dims
.
size
()
==
4
)
{
origin_tensor_dims
=
{
origin_tensor_dims
[
0
],
origin_tensor_dims
[
1
],
target_dims
[
2
],
target_dims
[
3
]};
tensor
->
Resize
(
origin_tensor_dims
);
}
DLOG
<<
node
->
name
<<
" ----dims: "
<<
tensor
->
dims
()
<<
"----numl----: "
<<
numl
;
}
if
(
reuse_tensor
==
nullptr
)
{
return
;
const
framework
::
DDim
&
image_dims
=
normal_converter
->
InitImageDimInfoWith
(
origin_tensor_dims
);
int64_t
image_dims_x
=
image_dims
[
0
];
int64_t
image_dims_y
=
image_dims
[
1
];
// classify memory into two parts
if
(
image_dims_x
>
image_dims_y
)
{
// choose a biggest tensor for reuse
if
(
x_based_max_numl
<
numl
)
{
x_based_max_numl
=
numl
;
x_based_reuse_tensor
=
tensor
;
}
x_based_max_x
=
std
::
max
(
x_based_max_x
,
image_dims_x
);
x_based_max_y
=
std
::
max
(
x_based_max_y
,
image_dims_y
);
}
else
{
// choose a biggest tensor for reuse
if
(
y_based_max_numl
<
numl
)
{
y_based_max_numl
=
numl
;
y_based_reuse_tensor
=
tensor
;
}
y_based_max_x
=
std
::
max
(
y_based_max_x
,
image_dims_x
);
y_based_max_y
=
std
::
max
(
y_based_max_y
,
image_dims_y
);
}
}
const
framework
::
DDim
&
dims
=
reuse_tensor
->
dims
();
cl_context
context
=
scope
->
GetCLScpoe
()
->
Context
();
cl_command_queue
command_queue
=
scope
->
GetCLScpoe
()
->
CommandQueue
();
framework
::
DDim
reshaped_dim
=
framework
::
make_ddim
(
{
dims
[
0
],
dims
[
1
],
target_dims
[
2
],
target_dims
[
3
]});
PADDLE_MOBILE_ENFORCE
(
x_based_reuse_tensor
!=
nullptr
||
y_based_reuse_tensor
!=
nullptr
,
"x_based_reuse_tensor and y_based_reuse_tensor can not be null at same "
"time"
);
DLOG
<<
"target dims : "
<<
target_dims
;
DLOG
<<
"reshaped_dim : "
<<
reshaped_dim
;
reuse_tensor
->
InitFakeSizeImage
(
context
,
command_queue
,
reshaped_dim
,
reshaped_dim
);
// init x based shared cl mem
if
(
x_based_reuse_tensor
!=
nullptr
)
{
const
framework
::
DDim
&
x_reuse_dims
=
x_based_reuse_tensor
->
dims
();
x_based_reuse_tensor
->
InitFakeSizeImage
(
context
,
command_queue
,
x_reuse_dims
,
{
x_based_max_x
,
x_based_max_y
});
}
// init y based shared cl mem
if
(
y_based_reuse_tensor
!=
nullptr
)
{
const
framework
::
DDim
&
y_reuse_dims
=
y_based_reuse_tensor
->
dims
();
y_based_reuse_tensor
->
InitFakeSizeImage
(
context
,
command_queue
,
y_reuse_dims
,
{
y_based_max_x
,
y_based_max_y
});
}
// share mem
for
(
const
auto
&
node
:
list
)
{
auto
*
var
=
scope
->
Var
(
node
->
name
);
auto
*
tensor
=
var
->
template
GetMutable
<
framework
::
CLImage
>();
const
framework
::
DDim
&
temp_dim
=
tensor
->
dims
();
framework
::
DDim
need_dims
=
framework
::
make_ddim
(
{
temp_dim
[
0
],
temp_dim
[
1
],
target_dims
[
2
],
target_dims
[
3
]});
tensor
->
InitWithExitedMem
(
context
,
command_queue
,
need_dims
,
*
reuse_tensor
);
auto
need_dims
=
tensor
->
dims
();
// for super ,hack origin dims
if
(
target_dims
.
size
()
==
4
)
{
need_dims
=
{
need_dims
[
0
],
need_dims
[
1
],
target_dims
[
2
],
target_dims
[
3
]};
}
const
framework
::
DDim
&
need_image_dims
=
normal_converter
->
InitImageDimInfoWith
(
need_dims
);
int64_t
image_dims_x
=
need_image_dims
[
0
];
int64_t
image_dims_y
=
need_image_dims
[
1
];
if
(
image_dims_x
>
image_dims_y
)
{
PADDLE_MOBILE_ENFORCE
(
x_based_reuse_tensor
!=
nullptr
,
"x_based_reuse_tensor not null here"
);
tensor
->
InitWithExistMem
(
context
,
command_queue
,
need_dims
,
*
x_based_reuse_tensor
);
}
else
{
PADDLE_MOBILE_ENFORCE
(
y_based_reuse_tensor
!=
nullptr
,
"y_based_reuse_tensor not null here"
);
tensor
->
InitWithExistMem
(
context
,
command_queue
,
need_dims
,
*
y_based_reuse_tensor
);
}
}
}
}
...
...
mobile/src/pass/memory_optimize_
super
.h
→
mobile/src/pass/memory_optimize_
cl
.h
浏览文件 @
5506defe
...
...
@@ -19,10 +19,12 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include <vector>
#include "framework/cl/cl_image_converter.h"
#include "framework/lod_tensor.h"
#include "framework/program/program.h"
#include "pass/pass_base.h"
// use for super resulotion to be extend for all opencl
// use for opencl
namespace
paddle_mobile
{
namespace
pass
{
...
...
@@ -34,19 +36,20 @@ typedef struct {
// MemoryOptPass will analyze the program, and reuse memory between
// variables as much as possible
class
MemoryOptPass
Super
:
public
PassBase
{
class
MemoryOptPass
Cl
:
public
PassBase
{
public:
MemoryOptPass
Super
()
{}
virtual
~
MemoryOptPass
Super
()
{
MemoryOptPass
Cl
()
{}
virtual
~
MemoryOptPass
Cl
()
{
for
(
auto
&
it
:
created_nodes_
)
{
delete
it
.
second
;
}
delete
normal_converter
;
}
void
operator
()(
const
framework
::
ProgramDesc
*
program
,
framework
::
Scope
*
scope
,
MemoryOptimizationLevel
memory_optimization_level
,
framework
::
DDim
dims
);
framework
::
DDim
dims
=
{}
);
void
AppendBlockVars
(
const
framework
::
BlockDesc
*
block
);
...
...
@@ -63,6 +66,8 @@ class MemoryOptPassSuper : public PassBase {
std
::
vector
<
std
::
vector
<
ClVarNode
*>>
reused_nodes_
;
std
::
unordered_map
<
std
::
string
,
ClVarNode
*>
created_nodes_
;
std
::
unordered_map
<
std
::
string
,
framework
::
VarDesc
*>
block_vars_
;
paddle_mobile
::
framework
::
CLImageConverterNormal
*
normal_converter
=
new
paddle_mobile
::
framework
::
CLImageConverterNormal
();
};
}
// namespace pass
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录