Commit 4001ca6f
Authored on Oct 24, 2018 by liuruilong

update opencl code

Parent: 83737c34
Showing 24 changed files with 649 additions and 515 deletions (+649 -515)
CMakeLists.txt                                        +3   -3
src/framework/cl/cl_engine.h                          +3   -2
src/framework/cl/cl_helper.h                          +1   -4
src/framework/cl/cl_image.cpp                         +27  -194
src/framework/cl/cl_image.h                           +64  -165
src/framework/cl/cl_image_converter.cpp               +301 -0
src/framework/cl/cl_image_converter.h                 +74  -0
src/framework/cl/cl_scope.h                           +4   -2
src/framework/cl/cl_tensor.h                          +1   -3
src/framework/executor.cpp                            +0   -1
src/operators/kernel/arm/fetch_kernel.cpp             +1   -1
src/operators/kernel/cl/batchnorm_kernel.cpp          +3   -5
src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp   +21  -14
src/operators/kernel/cl/conv_add_kernel.cpp           +27  -16
src/operators/kernel/cl/conv_kernel.cpp               +18  -18
src/operators/kernel/cl/depthwise_conv_kernel.cpp     +13  -10
src/operators/kernel/cl/feed_kernel.cpp               +2   -2
src/operators/kernel/cl/fetch_kernel.cpp              +12  -12
src/operators/kernel/cl/pool_kernel.cpp               +13  -6
src/operators/kernel/cl/relu_kernel.cpp               +22  -22
src/operators/kernel/cl/reshape_kernel.cpp            +2   -2
src/operators/kernel/cl/softmax_kernel.cpp            +19  -19
src/operators/math/depthwise_conv_3x3.cpp             +3   -3
test/net/test_mobilenet_GPU.cpp                       +15  -11
CMakeLists.txt

    cmake_minimum_required(VERSION 3.6)
    option(USE_OPENMP "openmp support" OFF)
    project(paddle-mobile)
    option(USE_OPENMP "openmp support" OFF)
    option(DEBUGING "enable debug mode" ON)
    option(USE_EXCEPTION "use std exception" ON)
    option(LOG_PROFILE "log profile" OFF)
...
@@ -12,6 +10,8 @@ option(GPU_MALI "mali gpu" OFF)
    option(GPU_CL "opencl gpu" ON)
    option(FPGA "fpga" OFF)
    project(paddle-mobile)
    file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
    file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
    include_directories(src/)
...
src/framework/cl/cl_engine.h

@@ -90,7 +90,8 @@ class CLEngine {
  bool BuildProgram(cl_program program) {
    cl_int status;
    status = clBuildProgram(program, 0, 0, "-cl-fast-relaxed-math -I cl_kernel",
                            0, 0);
    CL_CHECK_ERRORS(status);
...
@@ -98,7 +99,7 @@ class CLEngine {
      size_t log_size;
      clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(),
                            CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
      char *log = (char *)malloc(log_size);
      char *log = reinterpret_cast<char *>(malloc(log_size));
      clGetProgramBuildInfo(program, CLEngine::Instance()->DeviceID(),
                            CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
      DLOG << " program build error: " << log;
...
src/framework/cl/cl_helper.h

@@ -49,9 +49,6 @@ class CLHelper {
  cl_context CLContext() { return scope_->Context(); }

  std::vector<size_t> DefaultWorkSize(const CLImage &image) {
    if (image.GetImageType() == Invalid) {
      PADDLE_MOBILE_THROW_EXCEPTION(" not support image type");
    }
    // n c h w
    auto image_dim = image.dims();
    if (image_dim.size() == 4) {
...
@@ -66,7 +63,7 @@ class CLHelper {
    } else if (image_dim.size() == 2) {
      return {1, image.ImageWidth(), image.ImageHeight()};
    } else if (image_dim.size() == 1) {
      return {1, image.ImageWidth(), 1};
    }
    PADDLE_MOBILE_THROW_EXCEPTION(" not support this dim, need imp ");
  }
...
src/framework/cl/cl_image.cpp

@@ -16,214 +16,47 @@ limitations under the License. */
namespace paddle_mobile {
namespace framework {

void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
                     cl_command_queue commandQueue) {
  DDim ddim = cl_image->dims();
  size_t N, C, H, W;
  if (ddim.size() == 4) {
    N = ddim[0];
    if (N < 0) { N = 1; }
    C = ddim[1];
    H = ddim[2];
    W = ddim[3];
  } else if (ddim.size() == 1) {
    N = 1;
    C = ddim[0];
    H = 1;
    W = 1;
  }
  size_t width = W * ((C + 3) / 4);
  size_t height = H * N;
  float *p = tensor->mutable_data<float>();
  half imageData[width * height * 4];
  cl_int err;
  cl_mem image = cl_image->GetCLImage();
  size_t origin[3] = {0, 0, 0};
  size_t region[3] = {width, height, 1};
  err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0,
                           imageData, 0, NULL, NULL);
  size_t i0 = 0;
  for (int n = 0; n < N; n++) {
    for (int c = 0; c < C; c++) {
      size_t i1 = i0;
      for (int h = 0; h < H; h++) {
        size_t i2 = (i1 << 2) + c % 4;
        for (int w = 0; w < W; w++) {
          *p = Half2Float(imageData[i2]);
          i2 += 4;
          p++;
        }
        i1 += width;
      }
    }
    i0 += width * H;
  }
  if (err != CL_SUCCESS) {
    CL_CHECK_ERRORS(err);
  }
  // TODO(yangfei): need imp
}

void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
                     cl_command_queue commandQueue) {
  DDim ddim = cl_image->dims();
  size_t N, C, H, W;
  if (ddim.size() == 4) {
    N = ddim[0];
    if (N < 0) { N = 1; }
    C = ddim[1];
    H = ddim[2];
    W = ddim[3];
  } else if (ddim.size() == 1) {
    N = 1;
    C = ddim[0];
    H = 1;
    W = 1;
  }
  size_t width = W * ((C + 3) / 4);
  size_t height = H * N;
  const float *p = tensor->data<float>();
  half imageData[width * height * 4];
  cl_mem image = cl_image->GetCLImage();
  size_t origin[3] = {0, 0, 0};
  size_t region[3] = {width, height, 1};
  cl_int err;
  err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0,
                           imageData, 0, NULL, NULL);
  if (err != CL_SUCCESS) {
    CL_CHECK_ERRORS(err);
  }
  size_t i0 = 0;
  for (int n = 0; n < N; n++) {
    for (int c = 0; c < C; c++) {
      size_t i1 = i0;
      for (int h = 0; h < H; h++) {
        size_t i2 = (i1 << 2) + c % 4;
        for (int w = 0; w < W; w++) {
          imageData[i2] = Float2Half(*p);
          i2 += 4;
          p++;
        }
        i1 += width;
      }
    }
    i0 += width * H;
  }
  // TODO(yangfei): need imp
}

#ifdef PADDLE_MOBILE_DEBUG
Print &operator<<(Print &printer, const CLImage &cl_image) {
  if (cl_image.GetImageType() == Invalid) {
    PADDLE_MOBILE_THROW_EXCEPTION(" not support image type");
  }
  printer << " dims: " << cl_image.dims() << "\n";
  int stride = cl_image.numel() / 20;
  stride = stride > 0 ? stride : 1;
  float *data = new float[cl_image.numel()];
  DDim ddim = cl_image.dims();
  size_t N, C, H, W, width, height;
  if (cl_image.GetImageType() == Normal || cl_image.dims().size() == 3 ||
      cl_image.dims().size() == 4) {
    if (ddim.size() == 4) {
      N = ddim[0];
      if (N < 0) { N = 1; }
      C = ddim[1];
      H = ddim[2];
      W = ddim[3];
      width = W * ((C + 3) / 4);
      height = N * H;
    } else if (ddim.size() == 2) {
      width = ddim[1];
      height = ddim[0];
      N = 1;
      C = 1;
      H = ddim[0];
      W = ddim[1];
    } else if (ddim.size() == 1) {
      width = ddim[0];
      height = 1;
      N = 1;
      C = 1;
      H = 1;
      W = ddim[0];
    }
    float *p = data;
    half *imageData = new half[height * width * 4];
    cl_int err;
    cl_mem image = cl_image.GetCLImage();
    size_t origin[3] = {0, 0, 0};
    size_t region[3] = {width, height, 1};
    err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
                             region, 0, 0, imageData, 0, NULL, NULL);
    if (err != CL_SUCCESS) {
      printf("ImageWidth %ld\n", cl_image.ImageWidth());
      printf("ImageWidth %ld\n", cl_image.ImageHeight());
    }
    size_t i0 = 0;
    for (int n = 0; n < N; n++) {
      for (int c = 0; c < C; c++) {
        size_t i1 = i0 + (c / 4) * W;
        for (int h = 0; h < H; h++) {
          size_t i2 = (i1 << 2) + c % 4;
          for (int w = 0; w < W; w++) {
            *p = Half2Float(imageData[i2]);
            i2 += 4;
            p++;
          }
          i1 += width;
        }
      }
      i0 += width * H;
    }
    delete (imageData);
    CL_CHECK_ERRORS(err);
    int width = cl_image.ImageDims()[0];
    int height = cl_image.ImageDims()[1];
  } else {
    if (ddim.size() == 2) {
      width = (ddim[1] + 3) / 4;
      height = ddim[0];
      H = ddim[0];
      W = ddim[1];
      half_t *image_data = new half_t[height * width * 4];
      cl_int err;
      cl_mem image = cl_image.GetCLImage();
      size_t origin[3] = {0, 0, 0};
      size_t region[3] = {width, height, 1};
      err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
                               region, 0, 0, image_data, 0, NULL, NULL);
    } else if (ddim.size() == 1) {
      width = (ddim[0] + 3) / 4;
      height = 1;
      H = 1;
      W = ddim[0];
    }
    float *p = data;
    half *imageData = new half[width * height * 4];
    cl_int err;
    cl_mem image = cl_image.GetCLImage();
    size_t origin[3] = {0, 0, 0};
    size_t region[3] = {width, height, 1};
    err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
                             region, 0, 0, imageData, 0, NULL, NULL);
    for (int h = 0; h < H; h++) {
      for (int w = 0; w < W; w++) {
        p[h * W + w] = Half2Float(imageData[(h * width + w / 4) * 4 + (w % 4)]);
      }
    }
    CL_CHECK_ERRORS(err);
    delete (imageData);
    CL_CHECK_ERRORS(err);
  }
  float *tensor_data = new float[cl_image.numel()];
  auto converter = cl_image.Converter();
  converter->ImageToNCHW(image_data, tensor_data, cl_image.ImageDims(),
                         cl_image.dims());
  int stride = cl_image.numel() / 20;
  stride = stride > 0 ? stride : 1;
  printer << " dims: " << cl_image.dims() << "\n";
  for (int i = 0; i < cl_image.numel(); i += stride) {
    printer << data[i] << " ";
    printer << tensor_data[i] << " ";
  }
  delete (data);
  delete[](tensor_data);
  delete[](image_data);
  return printer;
}
#endif
...
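Both copy routines are now gutted to a "TODO(yangfei): need imp" body, while the debug printer already goes through the new converter. A minimal sketch of how CLImageToTensor could be finished on top of the converter interface introduced in this commit is shown below; the function name is hypothetical and the clEnqueueReadImage call mirrors the pattern used in operator<< above, so treat it as an illustration rather than the project's implementation.

    // Sketch only: read the RGBA half-float image back, then let the image's
    // converter undo the NCHW -> image layout transform.
    void CLImageToTensorSketch(CLImage *cl_image, Tensor *tensor,
                               cl_command_queue command_queue) {
      DDim image_dims = cl_image->ImageDims();
      size_t width = image_dims[0];
      size_t height = image_dims[1];
      std::unique_ptr<half_t[]> image_data(new half_t[width * height * 4]);
      size_t origin[3] = {0, 0, 0};
      size_t region[3] = {width, height, 1};
      // Blocking read of the whole 2D image into the host buffer.
      cl_int err = clEnqueueReadImage(command_queue, cl_image->GetCLImage(),
                                      CL_TRUE, origin, region, 0, 0,
                                      image_data.get(), 0, NULL, NULL);
      CL_CHECK_ERRORS(err);
      // The converter knows which layout (default, folder, NW-block) was used.
      cl_image->Converter()->ImageToNCHW(image_data.get(),
                                         tensor->mutable_data<float>(),
                                         image_dims, cl_image->dims());
    }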
src/framework/cl/cl_image.h

@@ -18,22 +18,30 @@ limitations under the License. */
#include "CL/cl.h"
#include "framework/cl/cl_half.h"
#include "framework/cl/cl_tool.h"
#include "framework/cl/cl_deleter.h"
#include "framework/cl/cl_engine.h"
#include "framework/cl/cl_half.h"
#include "framework/cl/cl_image_converter.h"
#include "framework/cl/cl_tool.h"
#include "framework/ddim.h"
#include "framework/tensor.h"

namespace paddle_mobile {
namespace framework {

enum ImageType { Invalid = -1, Normal = 0, Folder = 1 };

class CLImage {
 public:
  CLImage() = default;

  ~CLImage() {
    if (tensor_data_ != nullptr) {
      delete[](tensor_data_);
    }
    if (image_converter_) {
      delete (image_converter_);
    }
  }
  /*
   * will not hold input tensor data, memcpy in this method
   * */
...
@@ -54,79 +62,79 @@ class CLImage {
   * folder when one dim or two dim
   * */
  void InitCLImage(cl_context context, cl_command_queue command_queue) {
    if (tensor_data_ == nullptr) {
      PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
    }
    DLOG << tensor_dims_;
    if (tensor_dims_.size() <= 2) {
      DLOG << " dim <= 2 folder ~~~~~ ";
      InitCLImage2C(context, command_queue, tensor_data_, tensor_dims_);
    } else {
      DLOG << " dim > 2 norm ~~~~~ ";
      InitCLImage(context, command_queue, tensor_data_, tensor_dims_);
    PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr,
                          " need call SetTensorData first");
    CLImageConverterFolder *folder_converter = new CLImageConverterFolder();
    InitCLImage(context, command_queue, folder_converter);
  }

  void InitCLImage(cl_context context, cl_command_queue command_queue,
                   CLImageConverterBase *converter) {
    if (image_converter_ != nullptr) {
      delete (image_converter_);
    }
    PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr,
                          " need call SetTensorData first");
    DLOG << " begin init cl image ";
    image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
    half_t *image_data = new half_t[product(image_dims_) * 4];
    DLOG << " convert to image";
    converter->NCHWToImage(tensor_data_, image_data, tensor_dims_);
    DLOG << " end convert to image";
    InitCLImage(context, image_dims_[0], image_dims_[1], image_data);
    delete[](image_data);
    delete[](tensor_data_);
    command_queue_ = command_queue;
    tensor_data_ = nullptr;
    image_converter_ = converter;
    initialized_ = true;
    DLOG << " end init cl image";
  }

  /*
   * need call SetTensorData first
   * */
  void InitCLImageNormal(cl_context context, cl_command_queue command_queue) {
  void InitNImage(cl_context context, cl_command_queue command_queue) {
    if (tensor_data_ == nullptr) {
      PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
    }
    InitCLImage(context, command_queue, tensor_data_, tensor_dims_);
    delete[](tensor_data_);
    tensor_data_ = nullptr;
    initialized_ = true;
    CLImageConverterNWBlock *folder_converter = new CLImageConverterNWBlock();
    InitCLImage(context, command_queue, folder_converter);
    PADDLE_MOBILE_ENFORCE(tensor_dims_.size() == 4, " tensor dim is not 4");
  }

  void InitEmptyImage(cl_context context, cl_command_queue command_queue,
                      const DDim &dim) {
    if (tensor_data_ != nullptr) {
      PADDLE_MOBILE_THROW_EXCEPTION(
          " empty image tensor data shouldn't have value");
    }
    DLOG << " init empty image ";
    if (tensor_dims_.size() <= 2) {
      DLOG << " dim <= 2 folder ~~~~~ ";
      InitCLImage2C(context, command_queue, tensor_data_, tensor_dims_);
    } else {
      DLOG << " dim > 2 norm ~~~~~ ";
      InitCLImage(context, command_queue, tensor_data_, tensor_dims_);
    }
    PADDLE_MOBILE_ENFORCE(tensor_data_ == nullptr,
                          " empty image tensor data shouldn't have value");
    cl_event_ = CLEngine::Instance()->CreateEvent(context);
    CLImageConverterFolder *folder_converter = new CLImageConverterFolder();
    DLOG << " to get image dims ";
    image_dims_ = folder_converter->InitImageDimInfoWith(dim);
    DLOG << " end get image dims " << image_dims_;
    InitCLImage(context, image_dims_[0], image_dims_[1], nullptr);
    // InitCLImage(context, command_queue, nullptr, dim);
    tensor_dims_ = dim;
    command_queue_ = command_queue;
    image_converter_ = folder_converter;
    cl_event_ = CLEngine::Instance()->CreateEvent(context);
    initialized_ = true;
    DLOG << " end init cl image";
  }

  cl_mem GetCLImage() const { return cl_image_.get(); }

  const DDim &ImageDims() const { return image_dims_; }

  inline size_t ImageWidth() const { return image_width_; }
  inline size_t ImageHeight() const { return image_height_; }
  inline size_t ImageWidth() const { return image_dims_[0]; }

  /*
   * block of channels, 4 channel one block
   * */
  inline size_t CBlock() const { return c_block_; }

  /*
   * width of original tensor
   * */
  inline size_t WidthOfOneBlock() const { return width_of_one_block_; }

  /*
   * height of original tensor
   * */
  inline size_t HeightOfOneBlock() const { return height_of_one_block_; }

  inline size_t ImageHeight() const { return image_dims_[1]; }

  inline cl_command_queue CommandQueue() const { return command_queue_; }
...
@@ -158,47 +166,11 @@ class CLImage {
   * */
  const DDim &dims() const { return tensor_dims_; }

  const ImageType GetImageType() const { return image_type_; }

  cl_event GetClEvent() const { return cl_event_.get(); }

 private:
  ImageType image_type_ = Invalid;

  void InitCLImage2C(cl_context context, cl_command_queue command_queue,
                     float *tensor_data, const DDim &dim) {
    image_type_ = Folder;
    command_queue_ = command_queue;
    assert(dim.size() <= 2);
    int tdim[2] = {1, 1};
    if (dim.size() == 1) {
      tdim[1] = dim[0];
    } else {
      tdim[0] = dim[0];
      tdim[1] = dim[1];
    }
    int width = (tdim[1] + 3) / 4;
    int height = tdim[0];
    image_width_ = width;
    image_height_ = height;
    image_dims_ = make_ddim({width, height});
    width_of_one_block_ = width;
    height_of_one_block_ = height;
    c_block_ = 1;
    std::unique_ptr<half_t[]> imageData{};
    if (tensor_data) {
      imageData.reset(new half_t[width * height * 4]);
      for (int h = 0; h < tdim[0]; h++) {
        for (int w = 0; w < tdim[1]; w++) {
          imageData[(h * width + w / 4) * 4 + (w % 4)] =
              Float2Half(tensor_data[h * tdim[1] + w]);
        }
      }
    }
    InitCLImage(context, width, height, imageData.get());
  }

  CLImageConverterBase *Converter() const { return image_converter_; }

 private:
  void InitCLImage(cl_context context, int width, int height, void *data) {
    cl_image_format cf = {.image_channel_order = CL_RGBA,
                          .image_channel_data_type = CL_HALF_FLOAT};
...
@@ -228,89 +200,16 @@ class CLImage {
      PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error ");
    }
  }

  void InitCLImage(cl_context context, cl_command_queue command_queue,
                   float *tensor_data, const DDim &dim) {
    image_type_ = Normal;
    DLOG << " tensor dim: " << dim;
    // NCHW -> [W * (C+3)/4, H * N]
    tensor_dims_ = dim;
    command_queue_ = command_queue;
    if (tensor_data) {
      tensor_data_ = tensor_data;
    }
    size_t new_dims[] = {1, 1, 1, 1};
    for (int j = 0; j < dim.size(); ++j) {
      new_dims[4 - dim.size() + j] = dim[j];
    }
    size_t N, C, H, W;
    N = new_dims[0];
    C = new_dims[1];
    H = new_dims[2];
    W = new_dims[3];
    width_of_one_block_ = W;
    height_of_one_block_ = H;
    size_t width = W * ((C + 3) / 4);
    size_t height = H * N;
    image_width_ = width;
    image_height_ = height;
    image_dims_ = make_ddim({image_width_, image_height_});
    c_block_ = width / W;
    DLOG << " tensor dim " << tensor_dims_;
    DLOG << " 赋值时: image width: " << image_width_;
    DLOG << " 赋值时: image height: " << image_height_;
    std::unique_ptr<half_t[]> imageData{};
    int count = 0;
    if (tensor_data != nullptr) {
      imageData.reset(new half_t[width * height * 4]);
      float *p = tensor_data;
      size_t i0 = 0;
      for (int n = 0; n < N; n++) {
        for (int c = 0; c < c_block_ * 4; c++) {
          size_t i1 = i0 + (c / 4) * W;
          for (int h = 0; h < H; h++) {
            size_t i2 = (i1 << 2) + c % 4;
            for (int w = 0; w < W; w++) {
              if (c < C) {
                // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
                // (c % 4);
                imageData[i2] = Float2Half(*p);
                i2 += 4;
                p++;
              } else {
                imageData[i2] = 0.0;
                i2 += 4;
              }
            }
            i1 += width;
          }
        }
        i0 += width * H;
      }
    }
    InitCLImage(context, width, height, imageData.get());
  }

  bool initialized_ = false;
  std::unique_ptr<_cl_mem, CLMemDeleter> cl_image_;
  std::unique_ptr<_cl_event, CLEventDeleter> cl_event_;
  size_t image_width_;
  size_t width_of_one_block_;
  size_t height_of_one_block_;
  size_t image_height_;
  size_t c_block_;
  DDim tensor_dims_;
  DDim image_dims_;
  float *tensor_data_ = nullptr;
  cl_context context_;
  cl_command_queue command_queue_;
  CLImageConverterBase *image_converter_ = nullptr;
};

void TensorToCLImage(Tensor *tensor, CLImage *image,
...
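As a usage sketch of the reworked interface: the enforce messages above say SetTensorData must be called first, after which InitCLImage picks a converter-defined layout. The snippet below is illustrative only; the SetTensorData signature and the surrounding variables (filter_cpu_data, filter_dims, context, command_queue) are assumptions, not part of this diff.

    // Typical flow for a filter/bias image after this change (sketch):
    framework::CLImage filter_image;
    // Hand over host NCHW data; the CLImage copies it internally.
    filter_image.SetTensorData(filter_cpu_data, filter_dims);
    // Default path: folder layout, 4-channel blocks packed along the width.
    filter_image.InitCLImage(context, command_queue);
    // For 1x1 convolution filters the kernels above switch to the
    // N/W-blocked layout instead:
    //   filter_image.InitNImage(context, command_queue);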
src/framework/cl/cl_image_converter.cpp  (new file, 0 → 100644)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "framework/cl/cl_image_converter.h"

namespace paddle_mobile {
namespace framework {

const DDim &CLImageConverterDefault::InitImageDimInfoWith(
    const DDim &tensor_dim) {
  size_t new_dims[] = {1, 1, 1, 1};
  for (int j = 0; j < tensor_dim.size(); ++j) {
    new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
  }
  size_t N, C, H, W;
  N = new_dims[0];
  C = new_dims[1];
  H = new_dims[2];
  W = new_dims[3];
  size_t width = W * ((C + 3) / 4);
  size_t height = H * N;
  return make_ddim({width, height});
}

void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
                                          const DDim &tensor_dim) {
  size_t new_dims[] = {1, 1, 1, 1};
  for (int j = 0; j < tensor_dim.size(); ++j) {
    new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
  }
  size_t N, C, H, W;
  N = new_dims[0];
  C = new_dims[1];
  H = new_dims[2];
  W = new_dims[3];
  DDim in_image_dim = InitImageDimInfoWith(tensor_dim);
  DLOG << " tensor dim " << tensor_dim;
  DLOG << " image dim " << in_image_dim;
  size_t width = in_image_dim[0];
  size_t height = in_image_dim[1];
  int w_block = width / W;
  float *p = nchw;
  size_t i0 = 0;
  for (int n = 0; n < N; n++) {
    for (int c = 0; c < w_block * 4; c++) {
      size_t i1 = i0 + (c / 4) * W;
      for (int h = 0; h < H; h++) {
        size_t i2 = (i1 << 2) + c % 4;
        for (int w = 0; w < W; w++) {
          if (c < C) {
            // int x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
            // (c % 4);
            image[i2] = Float2Half(*p);
            i2 += 4;
            p++;
          } else {
            image[i2] = 0.0;
            i2 += 4;
          }
        }
        i1 += width;
      }
    }
    i0 += width * H;
  }
}

void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor,
                                          const DDim &image_dim,
                                          const DDim &tensor_dim) {
  size_t new_dims[] = {1, 1, 1, 1};
  for (int j = 0; j < tensor_dim.size(); ++j) {
    new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
  }
  size_t N, C, H, W;
  N = new_dims[0];
  C = new_dims[1];
  H = new_dims[2];
  W = new_dims[3];
  int width = image_dim[0];
  int height = image_dim[0];
  float *p = tensor;
  size_t i0 = 0;
  for (int n = 0; n < N; n++) {
    for (int c = 0; c < C; c++) {
      size_t i1 = i0 + (c / 4) * W;
      for (int h = 0; h < H; h++) {
        size_t i2 = (i1 << 2) + c % 4;
        for (int w = 0; w < W; w++) {
          *p = Half2Float(image[i2]);
          i2 += 4;
          p++;
        }
        i1 += width;
      }
    }
    i0 += width * H;
  }
}

const DDim &CLImageConverterFolder::InitImageDimInfoWith(
    const DDim &tensor_dim) {
  if (tensor_dim.size() <= 2) {
    int tdim[2] = {1, 1};
    if (tensor_dim.size() == 1) {
      tdim[1] = tensor_dim[0];
    } else {
      tdim[0] = tensor_dim[0];
      tdim[1] = tensor_dim[1];
    }
    int width = (tdim[1] + 3) / 4;
    int height = tdim[0];
    width_of_one_block_ = width;
    height_of_one_block_ = height;
    c_block_ = 1;
    return make_ddim({width, height});
  } else {
    size_t new_dims[] = {1, 1, 1, 1};
    for (int j = 0; j < tensor_dim.size(); ++j) {
      new_dims[4 - tensor_dim.size() + j] = tensor_dim[j];
    }
    size_t N, C, H, W;
    N = new_dims[0];
    C = new_dims[1];
    H = new_dims[2];
    W = new_dims[3];
    size_t width = W * ((C + 3) / 4);
    size_t height = H * N;
    width_of_one_block_ = W;
    height_of_one_block_ = H;
    c_block_ = width / W;
    return make_ddim({width, height});
  }
}

void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image,
                                         const DDim &tensor_dim) {
  PADDLE_MOBILE_ENFORCE(tensor_dim.size() <= 4 && tensor_dim.size() > 0,
                        "tensor dim is not support ");
  if (tensor_dim.size() > 2) {
    CLImageConverterDefault default_converter;
    default_converter.NCHWToImage(tensor, image, tensor_dim);
  } else {
    int tdim[2] = {1, 1};
    if (tensor_dim.size() == 1) {
      tdim[1] = tensor_dim[0];
    } else {
      tdim[0] = tensor_dim[0];
      tdim[1] = tensor_dim[1];
    }
    DDim image_dim = InitImageDimInfoWith(tensor_dim);
    int width = image_dim[0];
    for (int h = 0; h < tdim[0]; h++) {
      for (int w = 0; w < tdim[1]; w++) {
        image[(h * width + w / 4) * 4 + (w % 4)] =
            Float2Half(tensor[h * tdim[1] + w]);
      }
    }
  }
}

void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor,
                                         const DDim &image_dim,
                                         const DDim &tensor_dim) {
  if (tensor_dim.size() > 2) {
    CLImageConverterDefault default_converter;
    default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim);
  } else {
    int width = image_dim[0];
    int height = image_dim[1];
    int H, W;
    if (tensor_dim.size() == 2) {
      H = tensor_dim[0];
      W = tensor_dim[1];
    } else if (tensor_dim.size() == 1) {
      H = 1;
      W = tensor_dim[0];
    }
    float *p = tensor;
    for (int h = 0; h < H; h++) {
      for (int w = 0; w < W; w++) {
        p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]);
      }
    }
  }
}

const DDim &CLImageConverterNWBlock::InitImageDimInfoWith(
    const DDim &tensor_dim) {
  PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
  size_t N, C, H, W;
  N = tensor_dim[0];
  C = tensor_dim[1];
  H = tensor_dim[2];
  W = tensor_dim[3];
  size_t width = W * ((N + 3) / 4);
  size_t height = C * H;
  return make_ddim({width, height});
}

void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
                                          const DDim &tensor_dim) {
  PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
  auto image_dim = InitImageDimInfoWith(tensor_dim);
  float *p = tensor;
  int N = tensor_dim[0];
  int C = tensor_dim[1];
  int H = tensor_dim[2];
  int W = tensor_dim[3];
  int width = image_dim[0];
  int height = image_dim[1];
  int block = image_dim[0] / tensor_dim[3];
  for (int n = 0; n < block * 4; n++) {
    for (int c = 0; c < C; c++) {
      for (int h = 0; h < H; ++h) {
        for (int w = 0; w < W; ++w) {
          int index = 4 * c * (width * H) + 4 * (n / 4) * H * W + h * W * 4 +
                      w * 4 + (n % 4);
          if (n < N) {
            image[index] = Float2Half(*p);
            p++;
          } else {
            image[index] = 0.0;
          }
          if (index >= (width * height * 4)) {
            DLOG << " index out of range ";
          }
        }
      }
    }
  }
  DLOG << " init done";
}

void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
                                          const DDim &image_dim,
                                          const DDim &tensor_dim) {
  PADDLE_MOBILE_ENFORCE(tensor_dim.size() == 4, " tensor dim is not 4");
  float *p = tensor;
  int N = tensor_dim[0];
  int C = tensor_dim[1];
  int H = tensor_dim[2];
  int W = tensor_dim[3];
  int width = image_dim[0];
  int height = image_dim[1];
  int block = image_dim[0] / tensor_dim[3];
  for (int n = 0; n < N; n++) {
    for (int c = 0; c < C; c++) {
      for (int h = 0; h < H; ++h) {
        for (int w = 0; w < W; ++w) {
          int index = 4 * c * (width * H) + 4 * (n / 4) * H * W + h * W * 4 +
                      w * 4 + (n % 4);
          *p = Half2Float(image[index]);
          p++;
          if (index >= (width * height * 4)) {
            DLOG << " index out of range ";
          }
        }
      }
    }
  }
  DLOG << " init done";
}

}  // namespace framework
}  // namespace paddle_mobile
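For a concrete sense of the default layout above: a tensor of shape {1, 3, 224, 224} maps to a 224 x 224 RGBA image, since W * ((C + 3) / 4) = 224 and H * N = 224, with the three channels packed into one 4-component texel and the fourth component zero-padded. The small standalone sketch below restates the index math of CLImageConverterDefault for reference; the helper names are illustrative, not part of this file.

    #include <cstddef>
    #include <vector>

    // Image extents for an NCHW tensor under the default converter:
    // width = W * ceil(C / 4), height = H * N.
    std::vector<size_t> DefaultImageDims(size_t N, size_t C, size_t H, size_t W) {
      return {W * ((C + 3) / 4), H * N};
    }

    // Component index of element (n, c, h, w) in the flat RGBA half buffer.
    // Each group of four channels shares one texel; groups are laid out side
    // by side along the image width, and each (n, h) row is one image row.
    size_t DefaultImageIndex(size_t C, size_t H, size_t W,
                             size_t n, size_t c, size_t h, size_t w) {
      size_t image_width = W * ((C + 3) / 4);
      size_t row = n * H + h;          // image y coordinate
      size_t col = (c / 4) * W + w;    // image x coordinate
      return (row * image_width + col) * 4 + c % 4;  // component in the texel
    }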
src/framework/cl/cl_image_converter.h  (new file, 0 → 100644)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "framework/cl/cl_half.h"
#include "framework/ddim.h"

namespace paddle_mobile {
namespace framework {

class CLImageConverterBase {
 public:
  virtual void NCHWToImage(float *nchw, half_t *image,
                           const DDim &tensor_dim) = 0;
  virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim,
                           const DDim &tensor_dim) = 0;
  virtual const DDim &InitImageDimInfoWith(const DDim &tensor_dim) = 0;
};

class CLImageConverterDefault : public CLImageConverterBase {
 public:
  const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
  void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim);
  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
                   const DDim &tensor_dim);
};

class CLImageConverterFolder : public CLImageConverterBase {
 public:
  const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
  void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
                   const DDim &tensor_dim);

  /*
   * width of original tensor
   * */
  inline size_t WidthOfOneBlock() const { return width_of_one_block_; }

  /*
   * height of original tensor
   * */
  inline size_t HeightOfOneBlock() const { return height_of_one_block_; }

  int GetCBlock() const { return c_block_; }

 private:
  int c_block_;
  int width_of_one_block_;
  int height_of_one_block_;
};

class CLImageConverterNWBlock : public CLImageConverterBase {
  const DDim &InitImageDimInfoWith(const DDim &tensor_dim);
  void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
                   const DDim &tensor_dim);
};

}  // namespace framework
}  // namespace paddle_mobile
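A minimal round-trip sketch of the interface declared above, assuming Float2Half/Half2Float from cl_half.h and the factor-of-4 RGBA packing used throughout cl_image.h; the buffer management here is illustrative only.

    // NCHW floats -> half RGBA image buffer -> NCHW floats, host side only.
    using namespace paddle_mobile::framework;

    CLImageConverterDefault converter;
    DDim tensor_dim = make_ddim({1, 3, 224, 224});
    // Copy the returned dims into a value before reusing the converter.
    DDim image_dim = converter.InitImageDimInfoWith(tensor_dim);

    std::vector<float> nchw(1 * 3 * 224 * 224, 1.0f);
    std::vector<half_t> image(image_dim[0] * image_dim[1] * 4);

    converter.NCHWToImage(nchw.data(), image.data(), tensor_dim);

    std::vector<float> back(nchw.size());
    converter.ImageToNCHW(image.data(), back.data(), image_dim, tensor_dim);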
src/framework/cl/cl_scope.h

@@ -38,12 +38,14 @@ class CLScope {
  std::unique_ptr<_cl_kernel, CLKernelDeleter> GetKernel(
      const std::string &kernel_name, const std::string &file_name) {
    DLOG << " to get program " << file_name;
    auto program = Program(file_name);
    DLOG << " get program ~ ";
    DLOG << " end get program ~ ";
    DLOG << " to create kernel: " << kernel_name;
    std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel(
        clCreateKernel(program, kernel_name.c_str(), &status_));
    CL_CHECK_ERRORS(status_);
    DLOG << " create kernel ~ ";
    DLOG << " end create kernel ~ ";
    return std::move(kernel);
  }
...
src/framework/cl/cl_tensor.h

@@ -115,9 +115,7 @@ class CLTensor : TensorBase {
    return reinterpret_cast<T *>(host_ptr_);
  }

  int memorySize() { return holder_->size(); }

  ~CLTensor() {
    DLOG << "~CLTensor";
...
src/framework/executor.cpp

@@ -429,7 +429,6 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
  }
#endif
  auto last_op = ops.rbegin();
  auto output_map = (*last_op)->Outputs();
  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
...
src/operators/kernel/arm/fetch_kernel.cpp

@@ -23,4 +23,4 @@ void FetchKernel<CPU, float>::Compute(const FetchParam<CPU> &param) {
template class FetchKernel<CPU, float>;

}  // namespace operators
}  // namespace paddle_mobile

#endif
\ No newline at end of file
#endif
src/operators/kernel/cl/batchnorm_kernel.cpp

@@ -76,18 +76,16 @@ void BatchNormKernel<GPU_CL, float>::Compute(
  auto out = param.OutputY()->GetCLImage();
  auto new_scale = param.NewScale()->GetCLImage();
  auto new_bias = param.NewBias()->GetCLImage();
  const int out_height = param.OutputY()->HeightOfOneBlock();
  const int out_width = param.OutputY()->WidthOfOneBlock();
  const int out_width = default_work_size[1];

  clSetKernelArg(kernel, 0, sizeof(int), &out_height);
  clSetKernelArg(kernel, 1, sizeof(int), &out_width);
  clSetKernelArg(kernel, 2, sizeof(cl_mem), &input);
  clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_scale);
  clSetKernelArg(kernel, 4, sizeof(cl_mem), &new_bias);
  clSetKernelArg(kernel, 5, sizeof(cl_mem), &out);

  // cl_event out_event = param.OutputY()->GetClEvent();
  // cl_event wait_event = param.InputX()->GetClEvent();

  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                         default_work_size.data(), NULL, 0, NULL, NULL);
}
...
src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp

@@ -37,8 +37,7 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
  if (filter_ddim[1] == 1) {
    param->Filter()->Resize(ddim);
  }
  param->Filter()->InitCLImage(cl_helper_.CLContext(),
                               cl_helper_.CLCommandQueue());
  param->Bias()->InitCLImage(cl_helper_.CLContext(),
                             cl_helper_.CLCommandQueue());
...
@@ -135,19 +134,25 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
  param->SetOffset(offset);

  if (param->Filter()->WidthOfOneBlock() == 1 &&
      param->Filter()->HeightOfOneBlock() == 1) {
  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
    param->Filter()->InitNImage(cl_helper_.CLContext(),
                                cl_helper_.CLCommandQueue());
    this->cl_helper_.AddKernel("conv_1x1", "conv_add_bn_relu_kernel.cl");
    DLOG << " conv add bn relu conv 1x1";
  } else if (param->Filter()->dims()[0] == 1 &&
             param->Input()->dims()[1] == param->Output()->dims()[1] &&
             param->Filter()->dims()[2] == 3) {
    // this->cl_helper_.AddKernel("depth_conv_3x3",
    //                            "conv_add_bn_relu_kernel.cl");
    this->cl_helper_.AddKernel("depth_conv_3x3",
                               "depthwise_conv_add_bn_relu_kernel.cl");
    param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
    DLOG << " conv add bn relu depth_conv_3x3";
  } else if (param->Filter()->WidthOfOneBlock() == 3 &&
             param->Filter()->HeightOfOneBlock() == 3) {
  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
    param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
    this->cl_helper_.AddKernel("conv_3x3", "conv_add_bn_relu_kernel.cl");
    DLOG << " conv add bn relu conv_3x3";
  } else {
...
@@ -173,12 +178,14 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
  auto output = param.Output()->GetCLImage();
  int stride = param.Strides()[0];
  int offset = param.Offset();
  int input_c = param.Input()->CBlock();
  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
                    param.Input()->Converter())
                    ->GetCBlock();
  int dilation = param.Dilations()[0];
  int input_width = param.Input()->WidthOfOneBlock();
  int input_height = param.Input()->HeightOfOneBlock();
  int output_width = param.Output()->WidthOfOneBlock();
  int output_height = param.Output()->HeightOfOneBlock();
  int input_width = param.Input()->dims()[3];
  int input_height = param.Input()->dims()[2];
  int output_width = param.Output()->dims()[3];
  int output_height = param.Output()->dims()[2];

  // DLOG << " c block " << c_block;
  // DLOG << " w " << w;
...
src/operators/kernel/cl/conv_add_kernel.cpp

@@ -25,8 +25,6 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
      param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
          param->Paddings()[0] == param->Paddings()[1],
      "need equal");
  param->Filter()->InitCLImage(cl_helper_.CLContext(),
                               this->cl_helper_.CLCommandQueue());
  param->Bias()->InitCLImage(cl_helper_.CLContext(),
                             this->cl_helper_.CLCommandQueue());
...
@@ -34,14 +32,24 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
      static_cast<int>(param->Paddings()[1]);
  param->SetOffset(offset);

  if (param->Filter()->WidthOfOneBlock() == 1 &&
      param->Filter()->HeightOfOneBlock() == 1) {
  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
    param->Filter()->InitNImage(cl_helper_.CLContext(),
                                cl_helper_.CLCommandQueue());
    this->cl_helper_.AddKernel("conv_1x1", "conv_add_kernel.cl");
  } else if (param->Filter()->dims()[1] == 1) {
    param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
    this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_kernel.cl");
  } else if (param->Filter()->WidthOfOneBlock() == 3 &&
             param->Filter()->HeightOfOneBlock() == 3) {
  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
    param->Filter()->InitCLImage(cl_helper_.CLContext(),
                                 cl_helper_.CLCommandQueue());
    this->cl_helper_.AddKernel("conv_3x3", "conv_add_kernel.cl");
  } else {
    PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
  }
...
@@ -63,12 +71,15 @@ void ConvAddKernel<GPU_CL, float>::Compute(
  auto output = param.Output()->GetCLImage();
  int stride = param.Strides()[0];
  int offset = param.Offset();
  int input_c = param.Input()->CBlock();
  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
                    param.Input()->Converter())
                    ->GetCBlock();
  int dilation = param.Dilations()[0];
  int input_width = param.Input()->WidthOfOneBlock();
  int input_height = param.Input()->HeightOfOneBlock();
  int output_width = param.Output()->WidthOfOneBlock();
  int output_height = param.Output()->HeightOfOneBlock();
  int input_width = param.Input()->dims()[3];
  int input_height = param.Input()->dims()[2];
  int output_width = param.Output()->dims()[3];
  int output_height = param.Output()->dims()[2];

  cl_int status;
...
@@ -117,12 +128,12 @@ void ConvAddKernel<GPU_CL, float>::Compute(
  status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
  CL_CHECK_ERRORS(status);

  // cl_event out_event = param.Output()->GetClEvent();
  // cl_event wait_event = param.Input()->GetClEvent();

  status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel,
                                  default_work_size.size(), NULL,
                                  default_work_size.data(), NULL, 0, NULL, NULL);
  CL_CHECK_ERRORS(status);
}
...
src/operators/kernel/cl/conv_kernel.cpp

@@ -29,7 +29,7 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
  auto filter_ddim = param->Filter()->dims();

  std::vector<int64_t> filter_shape(
      {filter_ddim[1], filter_ddim[0], filter_ddim[2], filter_ddim[3]});
  framework::DDim ddim = framework::make_ddim(filter_shape);
  if (filter_ddim[1] == 1) {
    param->Filter()->Resize(ddim);
...
@@ -44,12 +44,11 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
  DLOG << " init helper: " << &cl_helper_;
  DLOG << " conv kernel add kernel ~ ";
  DLOG << " width of one block: " << param->Filter()->WidthOfOneBlock();
  DLOG << " height of one block: " << param->Filter()->HeightOfOneBlock();
  DLOG << " width of one block: " << param->Filter()->dims()[3];
  DLOG << " height of one block: " << param->Filter()->dims()[2];
  DLOG << " filter dims: " << param->Filter()->dims();

  if (param->Filter()->WidthOfOneBlock() == 1 &&
      param->Filter()->HeightOfOneBlock() == 1) {
  if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
    DLOG << " here1 ";
    this->cl_helper_.AddKernel("conv_1x1", "conv_kernel.cl");
...
@@ -59,8 +58,8 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
    DLOG << " here2 ";
    this->cl_helper_.AddKernel("depth_conv_3x3", "depthwise_conv_kernel.cl");

  } else if (param->Filter()->WidthOfOneBlock() == 3 &&
             param->Filter()->HeightOfOneBlock() == 3) {
  } else if (param->Filter()->dims()[2] == 3 &&
             param->Filter()->dims()[3] == 3) {
    DLOG << " here3 ";
    this->cl_helper_.AddKernel("conv_3x3", "conv_kernel.cl");
...
@@ -84,13 +83,15 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
  int stride = param.Strides()[0];
  int offset = param.Offset();
  int input_c = param.Input()->CBlock();
  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
                    param.Input()->Converter())
                    ->GetCBlock();
  int dilation = param.Dilations()[0];

  int input_width = param.Input()->WidthOfOneBlock();
  int input_height = param.Input()->HeightOfOneBlock();
  int output_width = param.Output()->WidthOfOneBlock();
  int output_height = param.Output()->HeightOfOneBlock();
  int input_width = param.Input()->dims()[3];
  int input_height = param.Input()->dims()[2];
  int output_width = param.Output()->dims()[3];
  int output_height = param.Output()->dims()[2];

  cl_int status;
...
@@ -122,13 +123,12 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
  status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
  status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);

  // cl_event out_event = param.Output()->GetClEvent();
  // cl_event wait_event = param.Input()->GetClEvent();

  status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel,
                                  default_work_size.size(), NULL,
                                  default_work_size.data(), NULL, 0, NULL, NULL);
  CL_CHECK_ERRORS(status);
}
...
src/operators/kernel/cl/depthwise_conv_kernel.cpp

@@ -50,12 +50,15 @@ void DepthwiseConvKernel<GPU_CL, float>::Compute(
  auto output = param.Output()->GetCLImage();
  int stride = param.Strides()[0];
  int offset = param.Offset();
  int input_c = param.Input()->CBlock();
  int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
                    param.Input()->Converter())
                    ->GetCBlock();
  int dilation = param.Dilations()[0];
  int input_width = param.Input()->WidthOfOneBlock();
  int input_height = param.Input()->HeightOfOneBlock();
  int output_width = param.Output()->WidthOfOneBlock();
  int output_height = param.Output()->HeightOfOneBlock();
  int input_width = param.Input()->dims()[3];
  int input_height = param.Input()->dims()[2];
  int output_width = param.Output()->dims()[3];
  int output_height = param.Output()->dims()[2];

  cl_int status;
...
@@ -76,12 +79,12 @@ void DepthwiseConvKernel<GPU_CL, float>::Compute(
  CL_CHECK_ERRORS(status);

  // cl_event out_event = param.Output()->GetClEvent();
  // cl_event wait_event = param.Input()->GetClEvent();

  status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel,
                                  default_work_size.size(), NULL,
                                  default_work_size.data(), NULL, 0, NULL, NULL);
  CL_CHECK_ERRORS(status);
}
...
src/operators/kernel/cl/feed_kernel.cpp

@@ -30,7 +30,7 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
  cl_int status;
  auto output = param.Out();
  const Tensor *input = param.InputX();
  // DLOG << *input;
  const float *input_data = input->data<float>();
  int numel = input->numel();
  cl_mem cl_image = output->GetCLImage();
...
@@ -52,7 +52,7 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
  size_t global_work_size[2] = {width, height};

  // cl_event out_event = param.Out()->GetClEvent();

  status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
                                  NULL, global_work_size, NULL, 0, NULL, NULL);
...
src/operators/kernel/cl/fetch_kernel.cpp

@@ -14,8 +14,8 @@ limitations under the License. */
#include "operators/kernel/fetch_kernel.h"
#include "framework/cl/cl_tensor.h"
//#include "common/common.h"
//#include <iostream>
// #include "common/common.h"
// #include <iostream>

namespace paddle_mobile {
namespace operators {
...
@@ -75,22 +75,22 @@ void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
    clSetKernelArg(kernel, 6, sizeof(int), &size_batch);
  }

  // cl_event wait_event = param.InpdutX()->GetClEvent();

  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                         default_work_size.data(), NULL, 0, NULL, NULL);

  // auto time1 = paddle_mobile::time();
  // printf(" before finish \n");
  // clFlsh(this->cl_helper_.CLCommandQueue());

  clFinish(this->cl_helper_.CLCommandQueue());

  // printf(" after finish \n");
  // auto time2 = paddle_mobile::time();
  //
  // std::cout << " finish cost :" << paddle_mobile::time_diff(time1, time2)
  //           << "ms" << std::endl;

  memcpy(out->data<float>(), out_cl_tensor.Data<float>(), out->memory_size());
}
...
src/operators/kernel/cl/pool_kernel.cpp

@@ -34,10 +34,17 @@ void PoolKernel<GPU_CL, float>::Compute(const PoolParam<GPU_CL> &param) {
  auto input = param.Input()->GetCLImage();
  auto out = param.Output()->GetCLImage();
  const int in_height = param.Input()->HeightOfOneBlock();
  const int in_width = param.Input()->WidthOfOneBlock();
  const int out_height = param.Output()->HeightOfOneBlock();
  const int out_width = param.Output()->WidthOfOneBlock();

  framework::CLImageConverterFolder *input_folder_converter =
      reinterpret_cast<framework::CLImageConverterFolder *>(
          param.Input()->Converter());
  framework::CLImageConverterFolder *output_folder_converter =
      reinterpret_cast<framework::CLImageConverterFolder *>(
          param.Output()->Converter());

  const int in_height = input_folder_converter->HeightOfOneBlock();
  const int in_width = input_folder_converter->WidthOfOneBlock();
  const int out_height = output_folder_converter->HeightOfOneBlock();
  const int out_width = output_folder_converter->WidthOfOneBlock();

  std::string pooling_type = param.PoolingType();
  std::vector<int> ksize = param.Ksize();
...
@@ -63,8 +70,8 @@ void PoolKernel<GPU_CL, float>::Compute(const PoolParam<GPU_CL> &param) {
  clSetKernelArg(kernel, 10, sizeof(cl_mem), &input);
  clSetKernelArg(kernel, 11, sizeof(cl_mem), &out);

  // cl_event out_event = param.Output()->GetClEvent();
  // cl_event wait_event = param.Input()->GetClEvent();

  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                         default_work_size.data(), NULL, 0, NULL, NULL);
}
...
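The same Converter() downcast to CLImageConverterFolder now recurs across the conv, depthwise-conv, and pool kernels in this commit. A small hypothetical helper, not part of the diff, would capture the repeated pattern:

    // Illustrative convenience wrapper around the repeated downcast.
    inline framework::CLImageConverterFolder *FolderConverterOf(
        const framework::CLImage *image) {
      return reinterpret_cast<framework::CLImageConverterFolder *>(
          image->Converter());
    }

    // e.g.  int input_c = FolderConverterOf(param.Input())->GetCBlock();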
src/operators/kernel/cl/relu_kernel.cpp

@@ -21,41 +21,41 @@ namespace operators {
template <>
bool ReluKernel<GPU_CL, float>::Init(ReluParam<GPU_CL>* param) {
  this->cl_helper_.AddKernel("relu", "relu.cl");
  // this->cl_helper_.AddKernel("relu_p0", "relu.cl");
  // this->cl_helper_.AddKernel("relu_p1", "relu.cl");
  // const auto dim =
  //     const_cast<framework::CLImage*>(param->InputX())->ImageDims();
  // param->getMidImage().InitEmptyImage(this->cl_helper_.CLContext(),
  //                                     this->cl_helper_.CLCommandQueue(),
  //                                     dim);
  return true;
}

template <>
void ReluKernel<GPU_CL, float>::Compute(const ReluParam<GPU_CL>& param) {
  auto kernel = this->cl_helper_.KernelAt(0);
  // auto kernel_p0 = this->cl_helper_.KernelAt(1);
  // auto kernel_p1 = this->cl_helper_.KernelAt(2);
  const auto* input = param.InputX();
  auto* output = param.Out();
  auto default_work_size = this->cl_helper_.DefaultWorkSize(*output);
  auto inputImage = input->GetCLImage();
  auto outputImage = output->GetCLImage();
  // auto tImage =
  //     const_cast<ReluParam<GPU_CL>&>(param).getMidImage().GetCLImage();
  clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
  clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
  // clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &inputImage);
  // clSetKernelArg(kernel_p0, 0, sizeof(cl_mem), &tImage);
  // clSetKernelArg(kernel_p1, 0, sizeof(cl_mem), &tImage);
  // clSetKernelArg(kernel_p1, 1, sizeof(cl_mem), &outputImage);
  const size_t work_size[2] = {input->ImageWidth(), input->ImageHeight()};

  // cl_event out_event = param.Out()->GetClEvent();
  // cl_event wait_event = param.InputX()->GetClEvent();

  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
                         work_size, NULL, 0, NULL, NULL);
  // clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel_p1, 3,
  //                        NULL, work_size, NULL, 0, NULL, NULL);
...
src/operators/kernel/cl/reshape_kernel.cpp

@@ -55,8 +55,8 @@ void ReshapeKernel<GPU_CL, float>::Compute(const ReshapeParam<GPU_CL> &param) {
  clSetKernelArg(kernel, 9, sizeof(cl_int), &odims[1]);

  const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()};

  // cl_event out_event = param.Out()->GetClEvent();
  // cl_event wait_event = param.InputX()->GetClEvent();

  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
                         work_size, NULL, 0, NULL, NULL);
...
src/operators/kernel/cl/softmax_kernel.cpp

@@ -42,27 +42,27 @@ void SoftmaxKernel<GPU_CL, float>::Compute(const SoftmaxParam<GPU_CL> &param) {
  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
  status = clSetKernelArg(kernel, 2, sizeof(int), &group);

  // const auto &inputDim = input->dims();
  //
  // int dims[4] = {1, 1, 1, 1};
  //
  // for (int i = 0; i < inputDim.size(); i++) {
  //   dims[4 - inputDim.size() + i] = inputDim[i];
  // }
  //
  // clSetKernelArg(kernel, 2, sizeof(int), &dims);
  // clSetKernelArg(kernel, 3, sizeof(int), &dims[1]);
  // clSetKernelArg(kernel, 4, sizeof(int), &dims[2]);
  // clSetKernelArg(kernel, 5, sizeof(int), &dims[3]);

  // cl_event out_event = param.Out()->GetClEvent();
  // cl_event wait_event = param.InputX()->GetClEvent();

  status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel,
                                  default_work_size.size(), NULL,
                                  default_work_size.data(), NULL, 0, NULL, NULL);
  CL_CHECK_ERRORS(status);
}

template class SoftmaxKernel<GPU_CL, float>;
...
src/operators/math/depthwise_conv_3x3.cpp

@@ -1465,7 +1465,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
                                     Tensor *output, const Tensor *new_scale,
                                     const Tensor *new_bias, bool if_relu) {
#if __ARM_NEON
//#ifdef _OPENMP
// #ifdef _OPENMP
  // const float *newscale_data = new_scale->data<float>();
  // const float *newbias_data = new_bias->data<float>();
  //
...
@@ -1645,7 +1645,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
  //  }
  //  }
  //
//#else
// #else
  const float *input_data = input->data<float>();
  const float *filter_data = filter->data<float>();
...
@@ -1877,7 +1877,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
    input_data += inhxw * c;
    output_data += outhxw * c;
  }
//#endif
// #endif
#endif
}
...
test/net/test_mobilenet_GPU.cpp

@@ -33,23 +33,27 @@ int main() {
    std::vector<int64_t> dims{1, 3, 224, 224};
    GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);

    std::vector<float> vec_result;
    // = paddle_mobile.Predict(input, dims);

    auto time3 = paddle_mobile::time();
    auto vec_result = paddle_mobile.Predict(input, dims);
    int max = 1;
    for (int i = 0; i < max; ++i) {
      vec_result = paddle_mobile.Predict(input, dims);
    }
    auto time4 = paddle_mobile::time();

    // for (int i = 0; i < 10; ++i) {
    //   auto vec_result = paddle_mobile.Predict(input, dims);
    // }
    // auto time3 = paddle_mobile::time();
    // for (int i = 0; i < 10; ++i) {
    //   auto vec_result = paddle_mobile.Predict(input, dims);
    // }
    // auto time4 = paddle_mobile::time();

    std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4)
              << "ms" << std::endl;
    std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) / max
              << "ms" << std::endl;

    std::vector<float>::iterator biggest =
        std::max_element(std::begin(vec_result), std::end(vec_result));
    std::cout << " Max element is " << *biggest << " at position "
...