PaddlePaddle / Paddle-Lite
Commit b1e877e9
Authored on Oct 17, 2018 by dolphin8
Commit message: merge
Parents: 386e6b42, 752befc4
Showing 31 changed files with 1,007 additions and 169 deletions.
CMakeLists.txt  +5 -6
src/framework/cl/cl_half.cpp  +6 -0
src/framework/cl/cl_half.h  +6 -0
src/framework/cl/cl_helper.h  +10 -0
src/framework/cl/cl_image.cpp  +64 -4
src/framework/cl/cl_image.h  +33 -24
src/framework/cl/cl_tensor.h  +44 -8
src/framework/executor.cpp  +8 -5
src/framework/operator.cpp  +4 -20
src/io/ios_io/PaddleMobileCPU.h  +0 -0
src/io/ios_io/PaddleMobileCPU.mm  +0 -0
src/io/ios_io/op_symbols.h  +0 -0
src/io/jni/PML.java  +0 -0
src/io/jni/paddle_mobile_jni.cpp  +0 -0
src/io/jni/paddle_mobile_jni.h  +0 -0
src/operators/kernel/cl/batchnorm_kernel.cpp  +4 -2
src/operators/kernel/cl/cl_kernel/cl_common.h  +2 -4
src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl  +322 -4
src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl  +320 -2
src/operators/kernel/cl/cl_kernel/conv_kernel.cl  +1 -0
src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp  +54 -6
src/operators/kernel/cl/conv_add_kernel.cpp  +32 -4
src/operators/kernel/cl/conv_kernel.cpp  +13 -1
src/operators/kernel/cl/depthwise_conv_kernel.cpp  +2 -1
src/operators/kernel/cl/elementwise_add_kernel.cpp  +20 -19
src/operators/kernel/cl/feed_kernel.cpp  +3 -9
src/operators/kernel/cl/fetch_kernel.cpp  +34 -33
src/operators/kernel/cl/reshape_kernel.cpp  +10 -10
src/operators/kernel/cl/softmax_kernel.cpp  +8 -5
test/net/test_mobilenet_GPU.cpp  +1 -1
tools/pre-commit.hooks/clang-format.hook  +1 -1
CMakeLists.txt
@@ -16,7 +16,6 @@ file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
 file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
 include_directories(src/)
 if(IS_IOS)
     set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
 else()
@@ -145,16 +144,16 @@ endif()
 if(ANDROID_NDK_TOOLCHAIN_INCLUDED)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
 else()
-    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.h)
-    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/jni/paddle_mobile_jni.cpp)
     list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
 endif()

 if(IS_IOS)
 else()
-    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.h)
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.h)
-    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.mm)
+    list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/PaddleMobileCPU.mm)
-    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h)
+    list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/io/ios_io/op_symbols.h)
 endif()
 set(CMAKE_VERBOSE_MAKEFILE ON)
src/framework/cl/cl_half.cpp
@@ -16,6 +16,9 @@ limitations under the License. */
 #include "framework/cl/cl_half.h"

+namespace paddle_mobile {
+namespace framework {
+
 static const uint32_t mantissatable[2048] = {
     0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000,
     0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000,
@@ -510,3 +513,6 @@ void HalfArray2FloatArray(half_t *h_array, float *f_array, int count) {
     f_array[i] = Half2Float(h_array[i]);
   }
 }
+
+}  // namespace framework
+}  // namespace paddle_mobile
src/framework/cl/cl_half.h
@@ -15,6 +15,9 @@ limitations under the License. */
 #pragma once

 #include <cstdint>

+namespace paddle_mobile {
+namespace framework {
+
 typedef uint16_t half_t;

 half_t Float2Half(float f);
@@ -24,3 +27,6 @@ float Half2Float(half_t h);
 void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);

 void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
+
+}  // namespace framework
+}  // namespace paddle_mobile
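These declarations are the half-precision helpers that the CL image and kernel code below relies on when packing float tensors into CL_HALF_FLOAT images. A minimal usage sketch, assuming only the declarations shown in this header (the buffer contents are illustrative, not taken from the repository):

#include <vector>
#include "framework/cl/cl_half.h"

int main() {
  using namespace paddle_mobile::framework;
  std::vector<float> src = {0.5f, 1.0f, -2.25f};
  std::vector<half_t> packed(src.size());
  std::vector<float> roundtrip(src.size());
  // float -> 16-bit half (stored in uint16_t) and back again
  FloatArray2HalfArray(src.data(), packed.data(), static_cast<int>(src.size()));
  HalfArray2FloatArray(packed.data(), roundtrip.data(), static_cast<int>(src.size()));
  return 0;
}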
src/framework/cl/cl_helper.h
@@ -64,6 +64,16 @@ class CLHelper {
       auto work_size_2 = n * h;

       return {work_size_0, work_size_1, work_size_2};
+    } else if (image_dim.size() == 2) {
+      auto image_width = image.ImageWidth();
+      auto work_size_0 = image_width / image_dim[1];
+      auto work_size_1 = image_dim[1];
+      auto work_size_2 = image_dim[0];
+
+      return {work_size_0, work_size_1, work_size_2};
     }

     PADDLE_MOBILE_THROW_EXCEPTION("not support this dim, need imp");
src/framework/cl/cl_image.cpp
@@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "cl_image.h"
+#include "framework/cl/cl_image.h"

 namespace paddle_mobile {
 namespace framework {

 void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
@@ -37,7 +38,7 @@ void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
   size_t width = W * ((C + 3) / 4);
   size_t height = H * N;

-  float *p = tensor->data<float>();
+  float *p = tensor->mutable_data<float>();
   half imageData[width * height * 4];
   cl_int err;
   cl_mem image = cl_image->GetCLImage();
@@ -63,7 +64,7 @@ void CLImageToTensor(CLImage *cl_image, Tensor *tensor,
   }
   if (err != CL_SUCCESS) {
-    // TODO: error handling
+    CL_CHECK_ERRORS(err);
   }
 }

 void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
@@ -97,7 +98,7 @@ void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
   err = clEnqueueReadImage(commandQueue, image, CL_TRUE, origin, region, 0, 0,
                            imageData, 0, NULL, NULL);
   if (err != CL_SUCCESS) {
-    // TODO: error handling
+    CL_CHECK_ERRORS(err);
   }
   size_t i0 = 0;
   for (int n = 0; n < N; n++) {
@@ -116,5 +117,64 @@ void TensorToCLImage(const Tensor *tensor, CLImage *cl_image,
     i0 += width * H;
   }
 }
+
+#ifdef PADDLE_MOBILE_DEBUG
+Print &operator<<(Print &printer, const CLImage &cl_image) {
+  printer << " dims: " << cl_image.dims() << "\n";
+  int stride = cl_image.numel() / 20;
+  stride = stride > 0 ? stride : 1;
+
+  float *data = new float[cl_image.numel()];
+  DDim ddim = cl_image.dims();
+  size_t N, C, H, W;
+  if (ddim.size() == 4) {
+    N = ddim[0];
+    if (N < 0) {
+      N = 1;
+    }
+    C = ddim[1];
+    H = ddim[2];
+    W = ddim[3];
+  } else if (ddim.size() == 1) {
+    N = 1;
+    C = ddim[0];
+    H = 1;
+    W = 1;
+  }
+  size_t width = W * ((C + 3) / 4);
+  size_t height = H * N;
+
+  float *p = data;
+  half imageData[width * height * 4];
+  cl_int err;
+  cl_mem image = cl_image.GetCLImage();
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {width, height, 1};
+  err = clEnqueueReadImage(cl_image.CommandQueue(), image, CL_TRUE, origin,
+                           region, 0, 0, imageData, 0, NULL, NULL);
+
+  size_t i0 = 0;
+  for (int n = 0; n < N; n++) {
+    for (int c = 0; c < C; c++) {
+      size_t i1 = i0;
+      for (int h = 0; h < H; h++) {
+        size_t i2 = (i1 << 2) + c % 4;
+        for (int w = 0; w < W; w++) {
+          *p = Half2Float(imageData[i2]);
+          i2 += 4;
+          p++;
+        }
+        i1 += width;
+      }
+    }
+    i0 += width * H;
+  }
+
+  CL_CHECK_ERRORS(err);
+
+  for (int i = 0; i < cl_image.numel(); i += stride) {
+    printer << data[i] << " ";
+  }
+  return printer;
+}
+#endif

 }  // namespace framework
 }  // namespace paddle_mobile
src/framework/cl/cl_image.h
@@ -46,27 +46,28 @@ class CLImage {
   /*
    * need call SetTensorData first
    * */
-  void InitCLImage(cl_context context) {
+  void InitCLImage(cl_context context, cl_command_queue command_queue) {
     if (tensor_data_ == nullptr) {
       PADDLE_MOBILE_THROW_EXCEPTION(" need call SetTensorData first");
     }
     if (tensor_dims_.size() <= 2) {
-      InitCLImage2C(context, tensor_data_, tensor_dims_);
+      InitCLImage2C(context, command_queue, tensor_data_, tensor_dims_);
     } else {
-      InitCLImage(context, tensor_data_, tensor_dims_);
+      InitCLImage(context, command_queue, tensor_data_, tensor_dims_);
     }
     delete[](tensor_data_);
     tensor_data_ = nullptr;
     initialized_ = true;
   }

-  void InitEmptyImage(cl_context context, const DDim &dim) {
+  void InitEmptyImage(cl_context context, cl_command_queue command_queue,
+                      const DDim &dim) {
     if (tensor_data_ != nullptr) {
       PADDLE_MOBILE_THROW_EXCEPTION(
           " empty image tensor data shouldn't have value");
     }
     DLOG << " init empty image ";
-    InitCLImage(context, nullptr, dim);
+    InitCLImage(context, command_queue, nullptr, dim);
     initialized_ = true;
   }
@@ -93,6 +94,8 @@ class CLImage {
    * */
   inline size_t HeightOfOneBlock() const { return height_of_one_block_; }

+  inline cl_command_queue CommandQueue() const { return command_queue_; }
+
   /*
    * resize original tensor dim
    * */
@@ -122,7 +125,9 @@ class CLImage {
   const DDim &dims() const { return tensor_dims_; }

  private:
-  void InitCLImage2C(cl_context context, float *tensor_data, const DDim &dim) {
+  void InitCLImage2C(cl_context context, cl_command_queue command_queue,
+                     float *tensor_data, const DDim &dim) {
+    command_queue_ = command_queue;
     assert(dim.size() <= 2);
     int tdim[2] = {1, 1};
     if (dim.size() == 1) {
@@ -138,7 +143,8 @@ class CLImage {
     imageData.reset(new half_t[width * height * 4]);
     for (int h = 0; h < tdim[0]; h++) {
       for (int w = 0; w < tdim[1]; w++) {
         imageData[(h * width + w / 4) * 4 + (w % 4)] =
             Float2Half(tensor_data[h * tdim[1] + w]);
       }
     }
@@ -149,35 +155,36 @@ class CLImage {
     cl_image_format cf = {.image_channel_order = CL_RGBA,
                           .image_channel_data_type = CL_HALF_FLOAT};
     cl_image_desc cid = {
         .image_type = CL_MEM_OBJECT_IMAGE2D,
         .image_width = width,
         .image_height = height,
         .image_depth = 1,
         .image_array_size = 1,
         .image_row_pitch = 0,
         .image_slice_pitch = 0,
         .num_mip_levels = 0,
         .num_samples = 0,
         // .buffer = nullptr
     };
     cid.buffer = nullptr;
     cl_int err;
     cl_image_ = clCreateImage(
         context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0),
         &cf,   // const cl_image_format *image_format
         &cid,  // const cl_image_desc *image_desc
         data,  // void *host_ptr
         &err);
     if (err != CL_SUCCESS) {
       CL_CHECK_ERRORS(err);
       PADDLE_MOBILE_THROW_EXCEPTION(" create image 2d error ");
     }
   }

-  void InitCLImage(cl_context context, float *tensor_data, const DDim &dim) {
+  void InitCLImage(cl_context context, cl_command_queue command_queue,
+                   float *tensor_data, const DDim &dim) {
     DLOG << " tensor dim: " << dim;
     // NCHW -> [W * (C+3)/4, H * N]
     tensor_dims_ = dim;
+    command_queue_ = command_queue;
     if (tensor_data) {
       tensor_data_ = tensor_data;
     }
@@ -203,6 +210,7 @@ class CLImage {
     image_width_ = width;
     image_height_ = height;
     image_dims_ = make_ddim({image_width_, image_height_});
+    c_block_ = W / width;
     std::unique_ptr<half_t[]> imageData{};
     int count = 0;
@@ -241,6 +249,7 @@ class CLImage {
   DDim image_dims_;
   float *tensor_data_;
   cl_context context_;
+  cl_command_queue command_queue_;
 };

 void TensorToCLImage(Tensor *tensor, CLImage *image,
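The net effect of this header change is that a CLImage now carries the cl_command_queue it was created with, and every init path requires it. A sketch of the call pattern a kernel's Init() would now use, assuming the context and queue come from the caller's CLHelper as in the kernel diffs below; the function and variable names here are placeholders, not code from the repository:

#include "framework/cl/cl_image.h"

void PrepareFilterImage(paddle_mobile::framework::CLImage *filter,
                        float *weights,
                        const paddle_mobile::framework::DDim &dim,
                        cl_context context, cl_command_queue queue) {
  filter->SetTensorData(weights, dim);  // must be called before InitCLImage
  filter->InitCLImage(context, queue);  // the queue is now a required argument
}

The stored queue is what the new CommandQueue() accessor and the debug operator<< in cl_image.cpp use to read the image back with clEnqueueReadImage.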
src/framework/cl/cl_tensor.h
@@ -28,7 +28,19 @@ namespace framework {

 class CLTensor : TensorBase {
  public:
-  explicit CLTensor(cl_context context) : context_(context) {}
+  CLTensor(cl_context context, cl_command_queue command_queue)
+      : context_(context), command_queue_(command_queue) {}
+
+  CLTensor() = default;
+
+  /*
+   * if init method haven't set context and command_queue, need set
+   * */
+  void SetContextAndCommandQueue(cl_context context,
+                                 cl_command_queue command_queue) {
+    context_ = context;
+    command_queue_ = command_queue;
+  }

   /*! Resize the dimensions of the memory block. */
   inline CLTensor &Resize(const DDim &dims) {
@@ -39,7 +51,8 @@ class CLTensor : TensorBase {
   template <typename T>
   inline T mutable_with_data(void *data) {
     int64_t size = numel() * sizeof(float);
-    holder_.reset(new PlaceholderImpl(size, data, typeid(T), context_));
+    holder_.reset(
+        new PlaceholderImpl(size, data, typeid(T), context_, command_queue_));
     return reinterpret_cast<T>(
         reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(holder_->ptr())));
   }
@@ -51,7 +64,7 @@ class CLTensor : TensorBase {
   PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.")
   int64_t size = numel() * SizeOfType(type);
   if (holder_ == nullptr || holder_->size() < size + offset_) {
-    holder_.reset(new PlaceholderImpl(size, type, context_));
+    holder_.reset(new PlaceholderImpl(size, type, context_, command_queue_));
     offset_ = 0;
   }
   return reinterpret_cast<void *>(
@@ -85,6 +98,7 @@ class CLTensor : TensorBase {
  private:
   cl_context context_;
+  cl_command_queue command_queue_;

   /*
    * virtual ~Placeholder() = default;
@@ -99,20 +113,31 @@ class CLTensor : TensorBase {
    * */
   struct PlaceholderImpl : public Placeholder {
     PlaceholderImpl(size_t size, void *input, std::type_index type,
-                    cl_context context)
+                    cl_context context, cl_command_queue command_queue)
         : ptr_(clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                               size, reinterpret_cast<void *>(input), NULL)),
           size_(size),
-          type_(type) {}
+          type_(type),
+          command_queue_(command_queue) {}

-    PlaceholderImpl(size_t size, std::type_index type, cl_context context)
+    PlaceholderImpl(size_t size, std::type_index type, cl_context context,
+                    cl_command_queue command_queue)
         : ptr_(clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, NULL)),
           size_(size),
-          type_(type) {}
+          type_(type),
+          command_queue_(command_queue) {}

     virtual size_t size() const { return size_; }

-    virtual void *ptr() const { return static_cast<void *>(ptr_.get()); }
+    virtual void *ptr() const {
+      if (host_ptr_) {
+        delete (host_ptr_);
+      }
+      char *host_ptr = new char[size_];
+      clEnqueueReadBuffer(command_queue_, ptr_.get(), CL_TRUE, 0, size_,
+                          host_ptr, 0, NULL, NULL);
+      return static_cast<void *>(host_ptr);
+    }

     virtual std::type_index type() const { return type_; }
@@ -124,6 +149,17 @@ class CLTensor : TensorBase {
     /* the current type of memory */
     std::type_index type_;

+    cl_command_queue command_queue_;
+
+    ~PlaceholderImpl() {
+      if (host_ptr_) {
+        delete (host_ptr_);
+      }
+    }
+
+   private:
+    void *host_ptr_;
   };
 };
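CLTensor follows the same pattern: it now stores a cl_command_queue next to the cl_context, passes both into PlaceholderImpl, and ptr() performs a blocking clEnqueueReadBuffer into a freshly allocated host buffer instead of handing back the cl_mem wrapper directly. A minimal construction sketch under those new signatures; `context` and `queue` here stand in for whatever the executor's CL scope provides:

#include "framework/cl/cl_tensor.h"

void MakeTensors(cl_context context, cl_command_queue queue) {
  using paddle_mobile::framework::CLTensor;
  // New two-argument constructor carries the queue along with the context.
  CLTensor a(context, queue);
  // Default construction is now allowed; context/queue can be supplied later.
  CLTensor b;
  b.SetContextAndCommandQueue(context, queue);
}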
src/framework/executor.cpp
@@ -87,7 +87,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
   for (int i = 0; i < blocks.size(); ++i) {
     std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
     std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
-    for (int j = 0; j < debug_to; ++j) {
+    for (int j = 0; j < ops.size(); ++j) {
       std::shared_ptr<framework::OpDesc> op = ops[j];
       DLOG << "create op: " << j << " " << op->Type();
       auto op_base = framework::OpRegistry<Dtype>::CreateOp(
@@ -416,7 +416,7 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
     }
   }
 #else
-  for (int i = 0; i < debug_to; i++) {
+  for (int i = 0; i < ops.size(); i++) {
 #ifdef PADDLE_MOBILE_PROFILE
     struct timespec ts;
     clock_gettime(CLOCK_MONOTONIC, &ts);
@@ -953,12 +953,14 @@ void Executor<GPU_CL, Precision::FP32>::InitMemory() {
       if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
         auto cl_image = var->template GetMutable<framework::CLImage>();
         cl_context context = program_.scope->GetCLScpoe()->Context();
+        cl_command_queue command_queue =
+            program_.scope->GetCLScpoe()->CommandQueue();
         const framework::TensorDesc &desc = var_desc->Tensor_desc();
         //  framework::DDim ddim = framework::make_ddim(desc.Dims());
         framework::DDim ddim = cl_image->dims();
         DLOG << var_desc->Name();
-        cl_image->InitEmptyImage(context, ddim);
+        cl_image->InitEmptyImage(context, command_queue, ddim);
       }
     }
   }
@@ -1010,11 +1012,12 @@ void Executor<GPU_CL, Precision::FP32>::InitCombineMemory() {
       } else {
         auto cl_image = var->template GetMutable<framework::CLImage>();
         cl_context context = program_.scope->GetCLScpoe()->Context();
+        cl_command_queue command_queue =
+            program_.scope->GetCLScpoe()->CommandQueue();
         const framework::TensorDesc &desc = var_desc->Tensor_desc();
         framework::DDim ddim = cl_image->dims();
         //  framework::DDim ddim = framework::make_ddim(desc.Dims());
-        cl_image->InitEmptyImage(context, ddim);
+        cl_image->InitEmptyImage(context, command_queue, ddim);
       }
     }
   }
src/framework/operator.cpp
@@ -57,10 +57,9 @@ void OperatorBase<Dtype>::CheckAllInputOutputSet() const {}
 template <typename Dtype>
 void OperatorBase<Dtype>::Run() {
-  DLOG << " begin run " << type_;
+  DLOG << " ----- Begin run impl --- " << type_ << " ----- ";
   RunImpl();
-  DLOG << " end run " << type_;
+  DLOG << " ----- End run impl --- " << type_ << " ----- ";
-  return;
 #ifdef PADDLE_MOBILE_DEBUG
   DLOG << "-------------" << type_ << "----------------------------";
   vector<string> input_keys = GetInputKeys();
@@ -75,16 +74,8 @@ void OperatorBase<Dtype>::Run() {
       if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor;
     } else {
       CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
-      //  cl_command_queue commandQueue =
-      //  scope_->GetCLScpoe()->CommandQueue(); Tensor
-      //  *tmp ;
-      //  CLImageToTensor(cl_image,tmp,commandQueue);
-      //  tmp->Resize(cl_image->dims());
-      const float *input = cl_image->data<float>();
       if (cl_image) {
-        DLOG << type_ << " input- " << key << "=" << cl_image->dims();
+        DLOG << type_ << " input- " << key << "=" << *cl_image;
-        //  if(input)
-        //  DLOG<<type_<<" input- "<<key<<"="<<*input;
       }
     }
@@ -108,15 +99,8 @@ void OperatorBase<Dtype>::Run() {
       }
     } else {
       CLImage *cl_image = vari->template GetMutable<framework::CLImage>();
-      //  cl_command_queue commandQueue =
-      //  scope_->GetCLScpoe()->CommandQueue(); Tensor *tmp ;
-      //  CLImageToTensor(cl_image,tmp,commandQueue);
-      //  tmp->Resize(cl_image->dims());
       if (cl_image) {
-        const float *output = cl_image->data<float>();
-        DLOG << type_ << " output- " << key << "=" << *cl_image;
+        DLOG << type_ << " output- " << key << "=" << cl_image->dims();
-        //  if(output)
-        //  DLOG<<type_<<" output- "<<key<<"="<<*output;
       }
     }
src/ios_io/PaddleMobileCPU.h → src/io/ios_io/PaddleMobileCPU.h (file moved)
src/ios_io/PaddleMobileCPU.mm → src/io/ios_io/PaddleMobileCPU.mm (file moved)
src/ios_io/op_symbols.h → src/io/ios_io/op_symbols.h (file moved)
src/jni/PML.java → src/io/jni/PML.java (file moved)
src/jni/paddle_mobile_jni.cpp → src/io/jni/paddle_mobile_jni.cpp (file moved)
src/jni/paddle_mobile_jni.h → src/io/jni/paddle_mobile_jni.h (file moved)
src/operators/kernel/cl/batchnorm_kernel.cpp
@@ -49,11 +49,13 @@ bool BatchNormKernel<GPU_CL, float>::Init(BatchNormParam<GPU_CL> *param) {
   framework::CLImage *new_scale = new framework::CLImage();
   new_scale->SetTensorData(new_scale_ptr, variance->dims());
-  new_scale->InitCLImage(this->cl_helper_.CLContext());
+  new_scale->InitCLImage(this->cl_helper_.CLContext(),
+                         this->cl_helper_.CLCommandQueue());

   framework::CLImage *new_bias = new framework::CLImage();
   new_bias->SetTensorData(new_bias_ptr, variance->dims());
-  new_bias->InitCLImage(this->cl_helper_.CLContext());
+  new_bias->InitCLImage(this->cl_helper_.CLContext(),
+                        this->cl_helper_.CLCommandQueue());

   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
src/operators/kernel/cl/cl_kernel/cl_common.h
@@ -12,11 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#pragma once;
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable

-inline hafl4 activation(half4 in
+inline half4 activation(half4 in
 #ifdef PRELU
                         ,
                         half4 prelu_alpha
@@ -28,7 +26,7 @@ inline hafl4 activation(half4 in
 #endif

 #ifdef RELU
-  fmax(in, 0.0);
+  output = fmax(in, (half4)(0.0f));
 #endif
   return output;
 }
src/operators/kernel/cl/cl_kernel/conv_add_bn_relu_kernel.cl
@@ -12,10 +12,328 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#define BIASE
#define BATCH_NORM
#define RELU
-#include "conv_kernel.inc.cl"
-#undef BIASE
-#undef BATCH_NORM
-#undef RELU
#include "cl_kernel/cl_common.h"

__kernel void conv_3x3(__private const int global_size_dim0,
                       __private const int global_size_dim1,
                       __private const int global_size_dim2,
                       __read_only image2d_t input_image,
                       __read_only image2d_t filter,
#ifdef BIASE
                       __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
                       __read_only image2d_t new_scale,
                       __read_only image2d_t new_biase,
#endif
                       __write_only image2d_t output_image,
                       __private const int stride,
                       __private const int offset,
                       __private const int input_c,
                       __private const int dilation,
                       __private const int input_width,/* of one block */
                       __private const int input_height,/* of one block */
                       __private const int output_width,
                       __private const int output_height) {

    const int out_c = get_global_id(0);
    const int out_w = get_global_id(1);
    const int out_nh = get_global_id(2);

    int2 stride_xy;
    stride_xy.x = stride;
    stride_xy.y = stride;

    int2 ouput_pos_in_one_block;
    ouput_pos_in_one_block.x = out_w;
    ouput_pos_in_one_block.y = out_nh;

    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                              CLK_ADDRESS_CLAMP |
                              CLK_FILTER_NEAREST;

    int2 in_pos_in_one_block;
    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;

#ifdef BIASE
    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
    half4 output = 0.0f;
#endif

    half4 input[9];

    for (int i = 0; i < input_c; ++i) {
        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
        input[0] = select(read_imageh(input_image, sampler,
                            (int2)(pos_in.x - dilation, pos_in.y - dilation)),
                          (half4)(0.0f),
                          (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height));
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height));
input[2] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height));
input[3] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height));
input[4] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height));
input[5] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height));
input[6] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
input[7] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height));
input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
for (int j = 0; j < 9; ++j) {
int2 fuck;
fuck.x = i * 3 + j % 3;
fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, fuck);
output.x += dot(input[j], weight_x);
fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, fuck);
output.y += dot(input[j], weight_y);
fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, fuck);
output.z += dot(input[j], weight_z);
fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, fuck);
output.w += dot(input[j], weight_w);
}
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output = activation(output);
#endif
write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
}
__kernel void depth_conv_3x3(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input,
__read_only image2d_t filter,
#ifdef BIASE
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST;
const int batch_index = out_nh / output_height;
const int out_nh_in_one_batch = out_nh % output_height;
const uint kernelHXW = 1;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
#ifdef BIASE
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
half4 output = 0.0f;
#endif
int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height);
int weight_x_to = out_c * 3;
half4 inputs[9];
inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height));
inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height));
inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height));
inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height));
inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height));
inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height));
inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height));
inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height));
inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height));
for (int j = 0; j < 9; ++j) {
half4 input = inputs[j];
half4 weight = read_imageh(filter, sampler, (int2)(weight_x_to + j % 3, j / 3));
output.x += input.x * weight.x;
output.y += input.y * weight.y;
output.z += input.z * weight.z;
output.w += input.w * weight.w;
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output = activation(output);
#endif
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
write_imageh(output_image, output_pos, output);
}
__kernel void conv_1x1(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#ifdef BIASE
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                              CLK_ADDRESS_CLAMP |
                              CLK_FILTER_NEAREST;

    const uint kernelHXW = 1;
    int2 stride_xy = (int2)(stride, stride);
    int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
    int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);

#ifdef BIASE
    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
    half4 output = 0.0f;
#endif

    for (int i = 0; i < input_c; ++i) {
        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
        if (pos_in.x >= 0 && pos_in.y >= 0 && pos_in.x < input_width && pos_in.y < input_height) {
            half4 input = read_imageh(input_image, sampler, pos_in);

            half4 weight_x = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0));
            output.x += dot(input, weight_x);

            half4 weight_y = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 1));
            output.y += dot(input, weight_y);

            half4 weight_z = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 2));
            output.z += dot(input, weight_z);

            half4 weight_w = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3));
            output.w += dot(input, weight_w);
        }
    }

#ifdef BATCH_NORM
    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif

#ifdef RELU
    output = activation(output);
#endif

    int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
    write_imageh(output_image, output_pos, output);
}
src/operators/kernel/cl/cl_kernel/conv_add_kernel.cl
@@ -12,6 +12,324 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define BIASE
-#include "conv_kernel.inc.cl"
-#undef BIASE

__kernel void conv_3x3(__private const int global_size_dim0,
                       __private const int global_size_dim1,
                       __private const int global_size_dim2,
                       __read_only image2d_t input_image,
                       __read_only image2d_t filter,
#ifdef BIASE
                       __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
                       __read_only image2d_t new_scale,
                       __read_only image2d_t new_biase,
#endif
                       __write_only image2d_t output_image,
                       __private const int stride,
                       __private const int offset,
                       __private const int input_c,
                       __private const int dilation,
                       __private const int input_width,/* of one block */
                       __private const int input_height,/* of one block */
                       __private const int output_width,
                       __private const int output_height) {

    const int out_c = get_global_id(0);
    const int out_w = get_global_id(1);
    const int out_nh = get_global_id(2);

    int2 stride_xy;
    stride_xy.x = stride;
    stride_xy.y = stride;

    int2 ouput_pos_in_one_block;
    ouput_pos_in_one_block.x = out_w;
    ouput_pos_in_one_block.y = out_nh;

    int2 in_pos_in_one_block;
    in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
    in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;

    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                              CLK_ADDRESS_CLAMP |
                              CLK_FILTER_NEAREST;

#ifdef BIASE
    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
    half4 output = 0.0f;
#endif

    half4 input[9];

    for (int i = 0; i < input_c; ++i) {
        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
        input[0] = select(read_imageh(input_image, sampler,
                            (int2)(pos_in.x - dilation, pos_in.y - dilation)),
                          (half4)(0.0f),
                          (ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height));
input[1] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height));
input[2] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height));
input[3] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height));
input[4] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height));
input[5] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height));
input[6] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
input[7] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height));
input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)(pos_in.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || pos_in.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height));
for (int j = 0; j < 9; ++j) {
int2 fuck;
fuck.x = i * 3 + j % 3;
fuck.y = out_c * 4 * 3 + 0 * out_c * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, fuck);
output.x += dot(input[j], weight_x);
fuck.y = out_c * 4 * 3 + 1 * out_c * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, fuck);
output.y += dot(input[j], weight_y);
fuck.y = out_c * 4 * 3 + 2 * out_c * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, fuck);
output.z += dot(input[j], weight_z);
fuck.y = out_c * 4 * 3 + 3 * out_c * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, fuck);
output.w += dot(input[j], weight_w);
}
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output = activation(output);
#endif
write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
}
__kernel void depth_conv_3x3(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input,
__read_only image2d_t filter,
#ifdef BIASE
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST;
const int batch_index = out_nh / output_height;
const int out_nh_in_one_batch = out_nh % output_height;
const uint kernelHXW = 1;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
#ifdef BIASE
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
half4 output = 0.0f;
#endif
int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height);
int weight_x_to = out_c * 3;
half4 inputs[9];
inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height));
inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height));
inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height));
inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height));
inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height));
inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height));
inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height));
inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height));
inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f),
(ushort4)(in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height));
for (int j = 0; j < 9; ++j) {
half4 input = inputs[j];
half4 weight = read_imageh(filter, sampler, (int2)(weight_x_to + j % 3, j / 3));
output.x += input.x * weight.x;
output.y += input.y * weight.y;
output.z += input.z * weight.z;
output.w += input.w * weight.w;
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output = activation(output);
#endif
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
write_imageh(output_image, output_pos, output);
}
__kernel void conv_1x1(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#ifdef BIASE
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                              CLK_ADDRESS_CLAMP          |
                              CLK_FILTER_NEAREST;

    const uint kernelHXW = 1;
    int2 stride_xy = (int2)(stride, stride);
    int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
    int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);

#ifdef BIASE
    half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
    half4 output = 0.0f;
#endif

    for (int i = 0; i < input_c; ++i) {
        int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
        if (pos_in.x >= 0 && pos_in.y >= 0 && pos_in.x < input_width && pos_in.y < input_height) {
            half4 input = read_imageh(input_image, sampler, pos_in);

            half4 weight_x = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 0));
            output.x += dot(input, weight_x);

            half4 weight_y = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 1));
            output.y += dot(input, weight_y);

            half4 weight_z = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 2));
            output.z += dot(input, weight_z);

            half4 weight_w = read_imageh(filter, sampler, (int2)(i, out_c * 4 + 3));
            output.w += dot(input, weight_w);
        }
    }

#ifdef BATCH_NORM
    output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif

#ifdef RELU
    output = activation(output);
#endif

    int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
    write_imageh(output_image, output_pos, output);
}
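Note on the conv_1x1 kernel above: each work-item writes one RGBA texel, i.e. four consecutive output channels at one spatial position, and each dot() consumes four input channels at a time from the packed CLImage. The following scalar sketch is only illustrative (it is not repository code, it assumes a dense per-channel layout, and it omits the BIASE/BATCH_NORM/RELU branches); it spells out the arithmetic that the texel indexing (i, out_c * 4 + k) performs.

// Hypothetical scalar equivalent of one conv_1x1 work-item.
// input  : in_channels x height x width, filter : out_channels x in_channels.
void Conv1x1AtPixel(const float *input, const float *filter, float *output,
                    int in_channels, int height, int width,
                    int out_c /* block index, like get_global_id(0) */,
                    int x, int y) {
  for (int k = 0; k < 4; ++k) {            // .x/.y/.z/.w of the output texel
    const int oc = out_c * 4 + k;          // filter row out_c * 4 + k above
    float acc = 0.0f;
    for (int ic = 0; ic < in_channels; ++ic) {  // the kernel does this 4 at a time via dot()
      acc += input[(ic * height + y) * width + x] * filter[oc * in_channels + ic];
    }
    output[(oc * height + y) * width + x] = acc;
  }
}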
src/operators/kernel/cl/cl_kernel/conv_kernel.cl
...
@@ -19,6 +19,7 @@ __kernel void conv_3x3(__private const int global_size_dim0,
                        __private const int global_size_dim2,
                        __read_only image2d_t input_image,
                        __read_only image2d_t filter,
 #ifdef BIASE
                        __read_only image2d_t bias,
 #endif
...
src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
...
@@ -29,8 +29,10 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
       param->Paddings()[0] == param->Paddings()[1],
       "need equal");
-  param->Filter()->InitCLImage(cl_helper_.CLContext());
-  param->Bias()->InitCLImage(cl_helper_.CLContext());
+  param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                               cl_helper_.CLCommandQueue());
+  param->Bias()->InitCLImage(cl_helper_.CLContext(),
+                             cl_helper_.CLCommandQueue());
   // const CL *mean = param->InputMean();
   const framework::CLImage *mean = param->InputMean();
...
@@ -38,6 +40,11 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
   const framework::CLImage *scale = param->InputScale();
   const framework::CLImage *bias = param->InputBias();
   const float epsilon = param->Epsilon();
+  //
+  //  DLOG << " climage mean: " << *mean;
+  //  DLOG << " climage variance: " << *variance;
+  //  DLOG << " climage scale: " << *scale;
+  //  DLOG << " climage bias: " << *bias;
   auto mean_ptr = mean->data<float>();
   auto variance_ptr = variance->data<float>();
...
@@ -62,12 +69,22 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
   framework::CLImage *new_scale = new framework::CLImage();
   new_scale->SetTensorData(new_scale_ptr, variance->dims());
-  new_scale->InitCLImage(this->cl_helper_.CLContext());
+  new_scale->InitCLImage(this->cl_helper_.CLContext(),
+                         cl_helper_.CLCommandQueue());
+
+  DLOG << " climage - y bias: " << *(param->Bias());
+
+  DLOG << " climage - new scale: " << *new_scale;
   framework::CLImage *new_bias = new framework::CLImage();
   new_bias->SetTensorData(new_bias_ptr, variance->dims());
-  new_bias->InitCLImage(this->cl_helper_.CLContext());
+  new_bias->InitCLImage(this->cl_helper_.CLContext(),
+                        cl_helper_.CLCommandQueue());
+
+  DLOG << " climage - new bias: " << *new_bias;
+
+  DLOG << " climage - filter: " << *(param->Filter());
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
...
@@ -113,7 +130,7 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
   auto biase = param.Bias()->GetCLImage();
   auto new_scale = param.NewScale()->GetCLImage();
   auto new_bias = param.NewBias()->GetCLImage();
-  auto output = param.Output();
+  auto output = param.Output()->GetCLImage();
   int stride = param.Strides()[0];
   int offset = param.Offset();
   int input_c = param.Input()->CBlock();
...
@@ -126,23 +143,54 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
   cl_int status;
   status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
   CL_CHECK_ERRORS(status);
   status =
...
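The Init() hunks above only show where new_scale_ptr and new_bias_ptr are wrapped into CLImages; the arithmetic that produces them from mean, variance, scale, bias and epsilon is in the elided lines. As an assumed reference, the standard batch-norm folding that this kind of Init() typically performs (so the kernel can apply the BATCH_NORM branch as a single multiply-add) looks like this; the function and names are illustrative, not repository code.

#include <cmath>
#include <vector>

// Assumed batch-norm folding: out = out * new_scale[c] + new_bias[c]
// reproduces (out - mean[c]) / sqrt(variance[c] + epsilon) * scale[c] + bias[c].
void FoldBatchNorm(const std::vector<float> &mean,
                   const std::vector<float> &variance,
                   const std::vector<float> &scale,
                   const std::vector<float> &bias, float epsilon,
                   std::vector<float> *new_scale, std::vector<float> *new_bias) {
  const size_t channels = mean.size();
  new_scale->resize(channels);
  new_bias->resize(channels);
  for (size_t c = 0; c < channels; ++c) {
    const float inv_std = 1.0f / std::sqrt(variance[c] + epsilon);
    (*new_scale)[c] = scale[c] * inv_std;
    (*new_bias)[c] = bias[c] - mean[c] * (*new_scale)[c];
  }
}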
src/operators/kernel/cl/conv_add_kernel.cpp
...
@@ -25,8 +25,10 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
       param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
       param->Paddings()[0] == param->Paddings()[1],
       "need equal");
-  param->Filter()->InitCLImage(cl_helper_.CLContext());
-  param->Bias()->InitCLImage(cl_helper_.CLContext());
+  param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                               this->cl_helper_.CLCommandQueue());
+  param->Bias()->InitCLImage(cl_helper_.CLContext(),
+                             this->cl_helper_.CLCommandQueue());
   int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
                static_cast<int>(param->Paddings()[1]);
...
@@ -71,27 +73,53 @@ void ConvAddKernel<GPU_CL, float>::Compute(
   cl_int status;
   status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 1, sizeof(int), &w);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &output);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 7, sizeof(int), &stride);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 8, sizeof(int), &offset);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 9, sizeof(int), &input_c);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 10, sizeof(int), &dilation);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 11, sizeof(int), &input_width);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 12, sizeof(int), &input_height);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 13, sizeof(int), &output_width);
+  CL_CHECK_ERRORS(status);
   status = clSetKernelArg(kernel, 14, sizeof(int), &output_height);
   CL_CHECK_ERRORS(status);
   status =
       clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                              default_work_size.data(), NULL, 0, NULL, NULL);
   CL_CHECK_ERRORS(status);
 }
...
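Most of the growth in the Compute() hunks above comes from following every clSetKernelArg with CL_CHECK_ERRORS. If one wanted to express that binding once, a small variadic helper could do it; the sketch below is purely illustrative (SetArg and SetKernelArgs are hypothetical names, not part of paddle-mobile), shown only to make the repeated pattern explicit.

#include <CL/cl.h>

// Hypothetical helper mirroring the repeated
//   status = clSetKernelArg(kernel, i, sizeof(T), &arg);  CL_CHECK_ERRORS(status);
// pattern: bind arguments 0..N-1 in declaration order, stop at the first failure.
template <typename T>
cl_int SetArg(cl_kernel kernel, cl_uint index, const T &value) {
  return clSetKernelArg(kernel, index, sizeof(T), &value);
}

template <typename... Args>
cl_int SetKernelArgs(cl_kernel kernel, const Args &... args) {
  cl_uint index = 0;
  cl_int status = CL_SUCCESS;
  // Braced-init-list guarantees left-to-right evaluation; later calls are
  // skipped once status holds an error.
  const cl_int results[] = {
      (status = (status == CL_SUCCESS ? SetArg(kernel, index++, args) : status))...};
  (void)results;
  return status;
}

A call mirroring the conv_add argument list would then read: SetKernelArgs(kernel, c_block, w, nh, input, filter, biase, output, stride, offset, input_c, dilation, input_width, input_height, output_width, output_height);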
src/operators/kernel/cl/conv_kernel.cpp
...
@@ -26,7 +26,8 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
       param->Paddings()[0] == param->Paddings()[1],
       "need equal");
-  param->Filter()->InitCLImage(cl_helper_.CLContext());
+  param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                               this->cl_helper_.CLCommandQueue());
   int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
                static_cast<int>(param->Paddings()[1]);
...
@@ -95,6 +96,17 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> &param) {
   cl_int status;
   DLOG << " begin set kernel arg ";
+  DLOG << " c block " << c_block;
+  DLOG << " w " << w;
+  DLOG << " nh " << nh;
+  DLOG << " stride " << stride;
+  DLOG << " offset " << offset;
+  DLOG << " input_c " << input_c;
+  DLOG << " dilation " << dilation;
+  DLOG << " input width " << input_width;
+  DLOG << " input height " << input_height;
+  DLOG << " output width " << output_width;
+  DLOG << " output height " << output_height;
   status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
   CL_CHECK_ERRORS(status);
...
src/operators/kernel/cl/depthwise_conv_kernel.cpp
...
@@ -27,7 +27,8 @@ bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
       param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
       param->Paddings()[0] == param->Paddings()[1],
       "need equal");
-  param->Filter()->InitCLImage(cl_helper_.CLContext());
+  param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                               this->cl_helper_.CLCommandQueue());
   int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
                static_cast<int>(param->Paddings()[1]);
   param->SetOffset(offset);
...
src/operators/kernel/cl/elementwise_add_kernel.cpp
...
@@ -22,16 +22,16 @@ namespace operators {
 template <>
 bool ElementwiseAddKernel<GPU_CL, float>::Init(
     ElementwiseAddParam<GPU_CL> *param) {
   CLImage *bias = (CLImage *)param->InputY();
-  bias->InitCLImage(cl_helper_.CLContext());
+  bias->InitCLImage(cl_helper_.CLContext(), this->cl_helper_.CLCommandQueue());
   if (bias->dims().size() == 4) {
     this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl");
   } else if (param->InputY()->dims().size() == 1) {
     DLOG << "-----init add-----";
     this->cl_helper_.AddKernel("channel_add", "channel_add_kernel.cl");
   } else {
     DLOG << "error:bias dims is error";
   }
   return true;
 }
...
@@ -44,7 +44,7 @@ void ElementwiseAddKernel<GPU_CL, float>::Compute(
   auto output = param.Out();
   cl_int status;
   auto kernel = this->cl_helper_.KernelAt(0);
   if (bias->dims().size() == 4) {
     cl_mem input_image = input->GetCLImage();
     cl_mem bias_image = bias->GetCLImage();
     cl_mem output_image = output->GetCLImage();
...
@@ -57,14 +57,15 @@ void ElementwiseAddKernel<GPU_CL, float>::Compute(
     int width = input->ImageWidth();
     int height = input->ImageHeight();
     size_t global_work_size[2] = {width, height};
-    status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
-                                    NULL, global_work_size, NULL, 0, NULL, NULL);
+    status =
+        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
+                               global_work_size, NULL, 0, NULL, NULL);
     CL_CHECK_ERRORS(status);
   } else if (bias->dims().size() == 1) {
     cl_mem input_image = input->GetCLImage();
     cl_mem bias_image = bias->GetCLImage();
     cl_mem output_image = output->GetCLImage();
-    int tensor_w = input->dims()[4];
+    int tensor_w = input->dims()[3];
     status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&input_image);
     CL_CHECK_ERRORS(status);
     status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&bias_image);
...
@@ -76,13 +77,13 @@ void ElementwiseAddKernel<GPU_CL, float>::Compute(
     int width = input->ImageWidth();
     int height = input->ImageHeight();
     size_t global_work_size[2] = {width, height};
-    status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
-                                    NULL, global_work_size, NULL, 0, NULL, NULL);
+    status =
+        clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
+                               global_work_size, NULL, 0, NULL, NULL);
     CL_CHECK_ERRORS(status);
   } else {
     DLOG << "error:bias dims is error";
   }
 }

 template class ElementwiseAddKernel<GPU_CL, float>;
...
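The only functional change in the channel_add branch above is tensor_w: assuming the usual 4-D NCHW shape, dims() holds {N, C, H, W}, so the width lives at index 3 and dims()[4] read one element past the end. A tiny standalone check of that indexing (illustrative, not repository code):

#include <array>
#include <cassert>

int main() {
  // A 4-D NCHW shape as dims() would report it (assumed layout): {N, C, H, W}.
  const std::array<int, 4> dims = {1, 32, 112, 112};
  const int tensor_w = dims[3];  // width; dims[4] would be out of range
  assert(tensor_w == 112);
  return 0;
}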
src/operators/kernel/cl/feed_kernel.cpp
...
@@ -30,12 +30,14 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
   cl_int status;
   auto output = param.Out();
   const Tensor *input = param.InputX();
+  DLOG << *input;
   const float *input_data = input->data<float>();
   int numel = input->numel();
   cl_mem cl_image = output->GetCLImage();
   int height = output->dims()[2];
   int width = output->dims()[3];
-  CLTensor input_cl_tensor(this->cl_helper_.CLContext());
+  CLTensor input_cl_tensor(this->cl_helper_.CLContext(),
+                           this->cl_helper_.CLCommandQueue());
   input_cl_tensor.Resize(input->dims());
   cl_mem inputBuffer =
       input_cl_tensor.mutable_with_data<cl_mem>((void *)input_data);
...
@@ -53,14 +55,6 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
   status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
                                   NULL, global_work_size, NULL, 0, NULL, NULL);
   CL_CHECK_ERRORS(status);
-
-  int len = 4 * 224 * 224;
-  half *out = new half[len];
-  cl_command_queue commandQueue = this->cl_helper_.CLCommandQueue();
-  size_t origin[3] = {0, 0, 0};
-  size_t region[3] = {height, width, 1};
-  clEnqueueReadImage(commandQueue, cl_image, CL_TRUE, origin, region, 0, 0, out,
-                     0, NULL, NULL);
 }

 template class FeedKernel<GPU_CL, float>;
...
src/operators/kernel/cl/fetch_kernel.cpp
...
@@ -19,44 +19,45 @@ namespace operators {
 template <>
 bool FetchKernel<GPU_CL, float>::Init(FetchParam<GPU_CL> *param) {
-  this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
+  //  this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
   return true;
 }

 template <>
 void FetchKernel<GPU_CL, float>::Compute(const FetchParam<GPU_CL> &param) {
-  auto kernel = this->cl_helper_.KernelAt(0);
-  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX());
-  auto input = param.InputX()->GetCLImage();
-  auto *out = param.Out();
-  const auto &dims = param.InputX()->dims();
-  const int N = dims[0];
-  const int C = dims[1];
-  const int in_height = dims[2];
-  const int in_width = dims[3];
-
-  int size_ch = in_height * in_width;
-  int size_block = size_ch * 4;
-  int size_batch = size_ch * C;
-
-  // need create outputBuffer
-  cl_image_format imageFormat;
-  imageFormat.image_channel_order = CL_RGBA;
-  imageFormat.image_channel_data_type = CL_FLOAT;
-  cl_mem outputBuffer;
-
-  clSetKernelArg(kernel, 0, sizeof(int), &in_height);
-  clSetKernelArg(kernel, 1, sizeof(int), &in_width);
-  clSetKernelArg(kernel, 2, sizeof(int), &size_ch);
-  clSetKernelArg(kernel, 3, sizeof(int), &size_block);
-  clSetKernelArg(kernel, 4, sizeof(int), &size_batch);
-  clSetKernelArg(kernel, 5, sizeof(cl_mem), &input);
-  clSetKernelArg(kernel, 6, sizeof(cl_mem), &outputBuffer);
-
-  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
-                         default_work_size.data(), NULL, 0, NULL, NULL);
+  //  auto kernel = this->cl_helper_.KernelAt(0);
+  //  auto default_work_size =
+  //  this->cl_helper_.DefaultWorkSize(*param.InputX());
+  //  auto input = param.InputX()->GetCLImage();
+  //  auto *out = param.Out();
+  //  const auto &dims = param.InputX()->dims();
+  //  const int N = dims[0];
+  //  const int C = dims[1];
+  //  const int in_height = dims[2];
+  //  const int in_width = dims[3];
+  //
+  //  int size_ch = in_height * in_width;
+  //  int size_block = size_ch * 4;
+  //  int size_batch = size_ch * C;
+  //
+  //  // need create outputBuffer
+  //  cl_image_format imageFormat;
+  //  imageFormat.image_channel_order = CL_RGBA;
+  //  imageFormat.image_channel_data_type = CL_FLOAT;
+  //  cl_mem outputBuffer;
+  //
+  //  clSetKernelArg(kernel, 0, sizeof(int), &in_height);
+  //  clSetKernelArg(kernel, 1, sizeof(int), &in_width);
+  //  clSetKernelArg(kernel, 2, sizeof(int), &size_ch);
+  //  clSetKernelArg(kernel, 3, sizeof(int), &size_block);
+  //  clSetKernelArg(kernel, 4, sizeof(int), &size_batch);
+  //  clSetKernelArg(kernel, 5, sizeof(cl_mem), &input);
+  //  clSetKernelArg(kernel, 6, sizeof(cl_mem), &outputBuffer);
+  //
+  //  clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
+  //                         default_work_size.data(), NULL, 0, NULL, NULL);
 }

 template class FetchKernel<GPU_CL, float>;
...
src/operators/kernel/cl/reshape_kernel.cpp
...
@@ -37,19 +37,19 @@ void ReshapeKernel<GPU_CL, float>::Compute(const ReshapeParam<GPU_CL> &param) {
   int dims[4] = {1, 1, 1, 1};
   int odims[4] = {1, 1, 1, 1};
   for (int i = 0; i < inputDim.size(); i++) {
     dims[4 - inputDim.size() + i] = inputDim[i];
   }
   for (int i = 0; i < outputDim.size(); i++) {
     odims[4 - outputDim.size() + i] = outputDim[i];
   }
-  clSetKernelArg(kernel, 2, sizeof(int), dims);
-  clSetKernelArg(kernel, 3, sizeof(int), dims + 1);
-  clSetKernelArg(kernel, 4, sizeof(int), dims + 2);
-  clSetKernelArg(kernel, 5, sizeof(int), dims + 3);
-  clSetKernelArg(kernel, 6, sizeof(int), odims);
-  clSetKernelArg(kernel, 7, sizeof(int), odims + 1);
-  clSetKernelArg(kernel, 8, sizeof(int), odims + 2);
-  clSetKernelArg(kernel, 9, sizeof(int), odims + 3);
+  clSetKernelArg(kernel, 2, sizeof(cl_int), &dims);
+  clSetKernelArg(kernel, 3, sizeof(cl_int), &dims[1]);
+  clSetKernelArg(kernel, 4, sizeof(cl_int), &dims[2]);
+  clSetKernelArg(kernel, 5, sizeof(cl_int), &dims[3]);
+  clSetKernelArg(kernel, 6, sizeof(cl_int), &odims);
+  clSetKernelArg(kernel, 7, sizeof(cl_int), &odims[1]);
+  clSetKernelArg(kernel, 8, sizeof(cl_int), &odims[1]);
+  clSetKernelArg(kernel, 9, sizeof(cl_int), &odims[1]);
   const size_t work_size[2] = {output->ImageWidth(), output->ImageHeight()};
   clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL,
...
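The reshape change is about how the scalar dimensions are handed to clSetKernelArg: the call copies exactly arg_size bytes from the pointer it is given immediately, so each element is now passed by address with an OpenCL-sized type (sizeof(cl_int), &dims[i]) rather than through array arithmetic (sizeof(int), dims + i). Both spellings point at the same bytes; the new form just makes the element explicit. A minimal illustration (hypothetical helper, not repository code):

#include <CL/cl.h>

// clSetKernelArg copies sizeof(cl_int) bytes from the given address at call
// time, so a local scalar only needs to live until the call returns.
cl_int SetDimArg(cl_kernel kernel, cl_uint index, cl_int value) {
  return clSetKernelArg(kernel, index, sizeof(cl_int), &value);
}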
src/operators/kernel/cl/softmax_kernel.cpp
...
@@ -36,11 +36,14 @@ void SoftmaxKernel<GPU_CL, float>::Compute(const SoftmaxParam<GPU_CL> &param) {
   clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImage);
   clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImage);
   const auto &inputDim = input->dims();
-  int dims[4] = {inputDim[0], inputDim[1], inputDim[2], inputDim[3]};
-  clSetKernelArg(kernel, 2, sizeof(int), dims);
-  clSetKernelArg(kernel, 3, sizeof(int), dims + 1);
-  clSetKernelArg(kernel, 4, sizeof(int), dims + 2);
-  clSetKernelArg(kernel, 5, sizeof(int), dims + 3);
+  int dims[4] = {1, 1, 1, 1};
+  for (int i = 0; i < inputDim.size(); i++) {
+    dims[4 - inputDim.size() + i] = inputDim[i];
+  }
+  clSetKernelArg(kernel, 2, sizeof(int), &dims);
+  clSetKernelArg(kernel, 3, sizeof(int), &dims[1]);
+  clSetKernelArg(kernel, 4, sizeof(int), &dims[2]);
+  clSetKernelArg(kernel, 5, sizeof(int), &dims[3]);
   clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
                          default_work_size.data(), NULL, 0, NULL, NULL);
...
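The softmax change replaces the direct {inputDim[0], ...} initializer, which assumed a 4-D input, with the same right-aligned padding loop already used in reshape_kernel.cpp, so lower-rank inputs still fill a 4-element dims array. A small standalone version of that padding (illustrative only):

#include <vector>

// Right-align an arbitrary-rank shape (rank <= 4 assumed) into a fixed
// 4-element array, as the loop in the hunk above does; a 2-D softmax input
// {1, 1000} becomes {1, 1, 1, 1000}.
std::vector<int> PadTo4D(const std::vector<int> &input_dim) {
  std::vector<int> dims(4, 1);
  for (size_t i = 0; i < input_dim.size(); ++i) {
    dims[4 - input_dim.size() + i] = input_dim[i];
  }
  return dims;
}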
test/net/test_mobilenet_GPU.cpp
...
@@ -23,7 +23,7 @@ int main() {
   // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model",
   //  std::string(g_mobilenet_detect) + "/params", true);
-  auto isok = paddle_mobile.Load(g_mobilenet, false);
+  auto isok = paddle_mobile.Load(g_mobilenet, true);
   if (isok) {
     auto time2 = paddle_mobile::time();
     std::cout << "load cost :" << paddle_mobile::time_diff(time1, time1) << "ms"
...
tools/pre-commit.hooks/clang-format.hook
...
@@ -17,7 +17,7 @@ shift
 perl -i -pe 's|^\s+#pragma\s+omp|// <TRICKY-CLANG-FORMAT-PRAGMA-FIX> #pragma omp|' "$@"
 (
 # remove clang format ios_io folder
-flist=$(echo "$@" | perl -pe 's|src/ios_io/[^ ]*||')
+flist=$(echo "$@" | perl -pe 's|src/io/ios_io/[^ ]*||')
 clang-format -i $flist
 )
 perl -i -pe 's|// <TRICKY-CLANG-FORMAT-PRAGMA-FIX> ||' "$@"