PaddlePaddle / PaddleDetection

Commit fc3a66ae
Authored June 25, 2019 by Chunwei

Merge branch 'fix_opencl_acc' into 'incubate/lite'

fix elementwise_add acc bugs. See merge request inference/paddlelite!58

Parents: 679aabac, 19bea13c

17 changed files with 210 additions and 633 deletions (+210 -633)
Files changed:

paddle/fluid/lite/api/mobilenetv1_test.cc                          +1    -1
paddle/fluid/lite/core/context.h                                   +3    -0
paddle/fluid/lite/kernels/opencl/elementwise_add_compute.cc        +2    -2
paddle/fluid/lite/kernels/opencl/elementwise_add_compute_test.cc   +4    -4
paddle/fluid/lite/opencl/CMakeLists.txt                            +2    -4
paddle/fluid/lite/opencl/cl_caller.cc                              +24   -14
paddle/fluid/lite/opencl/cl_caller.h                               +2    -2
paddle/fluid/lite/opencl/cl_engine.cc                              +1    -2
paddle/fluid/lite/opencl/cl_half.cc                                +0    -518
paddle/fluid/lite/opencl/cl_image.cc                               +4    -5
paddle/fluid/lite/opencl/cl_image_converter.cc                     +20   -21
paddle/fluid/lite/opencl/cl_image_converter.h                      +14   -15
paddle/fluid/lite/opencl/cl_kernel/channel_add_kernel.cl           +29   -0
paddle/fluid/lite/opencl/cl_kernel/cl_common.h                     +7    -9
paddle/fluid/lite/opencl/cl_kernel/elementwise_add_kernel.cl       +5    -6
paddle/fluid/lite/opencl/cl_kernel/pool_kernel.cl                  +7    -8
paddle/fluid/lite/opencl/cl_test.cc                                +85   -22
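The commit message and the tightened test tolerances below (1e-5 to 1e-6, and 1e-1 to 1e-6) point at the root cause: the OpenCL path stored tensors in half-precision (fp16) images, and IEEE binary16 keeps only a 10-bit mantissa, so a float -> half -> float round trip can shift a value by roughly value * 2^-11. The following standalone C++ calculation of that bound for the expected outputs in elementwise_add_compute_test.cc (3.4 * i) is purely illustrative and not part of the commit:

    #include <cmath>
    #include <cstdio>

    int main() {
      // Relative rounding bound for IEEE binary16 (10-bit mantissa): ~2^-11.
      const double half_eps = std::pow(2.0, -11);
      for (int i = 0; i < 10; ++i) {
        double v = 3.4 * i;                   // expected outputs in the test
        double worst_abs_err = v * half_eps;  // error after a float->half round trip
        std::printf("v=%5.1f  fp16 round-trip error up to %.1e\n", v, worst_abs_err);
      }
      // For v = 30.6 the bound is about 1.5e-2: acceptable under the old 1e-1
      // tolerance, far outside the new 1e-6 tolerance, hence the switch to
      // float-backed images throughout this commit.
      return 0;
    }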
paddle/fluid/lite/api/mobilenetv1_test.cc

@@ -61,7 +61,7 @@ void TestModel(const std::vector<Place>& valid_places,
        3.13812525e-05, 6.52209565e-05, 4.78087313e-05, 2.58822285e-04});
   for (int i = 0; i < results.size(); ++i) {
-    EXPECT_NEAR(out->data<float>()[i], results[i], 1e-5);
+    EXPECT_NEAR(out->data<float>()[i], results[i], 1e-6);
   }
   ASSERT_EQ(out->dims().size(), 2);
   ASSERT_EQ(out->dims()[0], 1);
paddle/fluid/lite/core/context.h

@@ -236,12 +236,15 @@ class Context<TargetType::kOpenCL> {
   void CopySharedTo(const OpenClContext* ctx) {
     ctx->cl_context_ = cl_context_;
     ctx->cl_helper_ = cl_helper_;
   }

  private:
   void PrepareKernels() {
     cl_helper_->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
     cl_helper_->AddKernel("channel_add", "channel_add_kernel.cl");
     cl_helper_->AddKernel("pool_max", "pool_kernel.cl");
     cl_helper_->AddKernel("pool_avg", "pool_kernel.cl");
   }
 };
 #endif
paddle/fluid/lite/kernels/opencl/elementwise_add_compute.cc

@@ -31,10 +31,10 @@ class ElementwiseAddCompute
   void Run() override {
     auto& param = *param_.get_mutable<param_t>();
     auto& context = ctx_->As<OpenClContext>();
-    CHECK(context.cl_context());
+    CHECK(context.cl_helper() != nullptr);
-    elementwise_add(context.cl_context(),
+    elementwise_add(context.cl_helper(),
                     static_cast<const float*>(param.X->raw_data()),
                     param.X->dims(),
                     static_cast<const float*>(param.Y->raw_data()),
                     param.Y->dims(), param.Out->mutable_data<float>(),
                     param.Out->dims());
   }
paddle/fluid/lite/kernels/opencl/elementwise_add_compute_test.cc

@@ -40,9 +40,9 @@ TEST(elementwise_add, init) {
   kernel->SetParam(param);
   kernel->SetContext(std::move(context));

-  X.Resize({1, 10});
-  Y.Resize({1, 10});
-  Out.Resize({1, 10});
+  X.Resize({1, 1, 1, 10});
+  Y.Resize({1, 1, 1, 10});
+  Out.Resize({1, 1, 1, 10});

   auto* x_data = X.mutable_data<float>();
   auto* y_data = Y.mutable_data<float>();

@@ -56,7 +56,7 @@ TEST(elementwise_add, init) {
   kernel->Launch();

   for (int i = 0; i < 10; i++) {
-    EXPECT_NEAR(out_data[i], 3.4 * i, 1e-1);
+    EXPECT_NEAR(out_data[i], 3.4 * i, 1e-6);
   }
 }
paddle/fluid/lite/opencl/CMakeLists.txt

@@ -5,13 +5,11 @@ endif()
 cc_library(cl_wrapper SRCS cl_wrapper.cc)
 cc_library(cl_tool SRCS cl_tool.cc)
 target_compile_options(cl_tool BEFORE PUBLIC -Wno-ignored-qualifiers)
-cc_library(cl_half SRCS cl_half.cc)
-target_compile_options(cl_half BEFORE PUBLIC -fno-strict-aliasing)
 cc_library(cl_engine SRCS cl_engine.cc DEPS cl_tool)
 cc_library(cl_context SRCS cl_context.cc DEPS cl_engine)
 cc_library(cl_helper SRCS cl_helper.cc DEPS cl_context)
-cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS cl_half lite_tensor)
-cc_library(cl_image SRCS cl_image.cc DEPS cl_half lite_tensor cl_image_converter cl_engine)
+cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS lite_tensor)
+cc_library(cl_image SRCS cl_image.cc DEPS lite_tensor cl_image_converter cl_engine)
 cc_library(cl_caller SRCS cl_caller.cc DEPS cl_helper cl_image)
 lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_helper cl_image cl_caller cl_wrapper)
 add_dependencies(cl_tool opencl_clhpp)
paddle/fluid/lite/opencl/cl_caller.cc

@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/lite/opencl/cl_caller.h"
 #include <string>
 #include "paddle/fluid/lite/core/compatible_tensor.h"
 #include "paddle/fluid/lite/opencl/cl_context.h"
 #include "paddle/fluid/lite/opencl/cl_engine.h"
 #include "paddle/fluid/lite/opencl/cl_helper.h"
 #include "paddle/fluid/lite/opencl/cl_image.h"

@@ -23,16 +22,17 @@ limitations under the License. */
 namespace paddle {
 namespace lite {

-static void CopyImageData(const CLImage& cl_image, float* out) {
+static void CopyImageData(CLHelper* helper, const CLImage& cl_image,
+                          float* out) {
   int width = cl_image.image_dims()[0];
   int height = cl_image.image_dims()[1];

-  half_t* image_data = new half_t[height * width * 4];
+  float* image_data = new float[height * width * 4];
   cl::Image* image = cl_image.cl_image();
   const std::array<size_t, 3> origin{0, 0, 0};
   const std::array<size_t, 3> region{static_cast<size_t>(width),
                                      static_cast<size_t>(height), 1};
-  cl_int err = CLEngine::Global()->command_queue().enqueueReadImage(
+  cl_int err = helper->OpenCLCommandQueue().enqueueReadImage(
       *image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
   CL_CHECK_ERRORS(err);

@@ -49,22 +49,25 @@ bool InitOpenCLEngine(std::string cl_path) {
   return engine->IsInitSuccess();
 }

-void elementwise_add(CLContext* context, const float* in, const DDim& in_dim,
+void elementwise_add(CLHelper* helper, const float* in, const DDim& in_dim,
                      const float* bias, const DDim& bias_dim, float* out,
                      const DDim& out_dim) {
-  CLHelper helper(context);
-  helper.AddKernel("elementwise_add", "elementwise_add_kernel.cl");
-  auto kernel = helper.GetKernel(0);
+  if (!(bias_dim.size() == 1 || bias_dim.size() == 4)) {
+    LOG(FATAL) << "Error: bias dims is error";
+    return;
+  }
+  auto kernel = bias_dim.size() == 1 ? helper->GetKernel("channel_add")
+                                     : helper->GetKernel("elementwise_add");
   CLImage in_image;
   in_image.set_tensor_data(in, in_dim);
-  in_image.InitNormalCLImage(helper.OpenCLContext());
+  in_image.InitNormalCLImage(helper->OpenCLContext());
   VLOG(3) << " --- Inpu image: " << in_image << " --- ";
   CLImage bias_image;
   bias_image.set_tensor_data(bias, bias_dim);
-  bias_image.InitNormalCLImage(helper.OpenCLContext());
+  bias_image.InitCLImage(helper->OpenCLContext());
   VLOG(3) << " --- Bias image: " << bias_image << " --- ";
   CLImage out_image;
-  out_image.InitEmptyImage(helper.OpenCLContext(), out_dim);
+  out_image.InitEmptyImage(helper->OpenCLContext(), out_dim);
   cl_int status;
   status = kernel.setArg(0, *in_image.cl_image());
   CL_CHECK_ERRORS(status);

@@ -72,16 +75,23 @@ void elementwise_add(CLContext* context, const float* in, const DDim& in_dim,
   CL_CHECK_ERRORS(status);
   status = kernel.setArg(2, *out_image.cl_image());
   CL_CHECK_ERRORS(status);
+  if (bias_dim.size() == 1) {
+    int tensor_w = in_dim[3];
+    status = kernel.setArg(3, tensor_w);
+    CL_CHECK_ERRORS(status);
+  }
   size_t width = in_image.ImageWidth();
   size_t height = in_image.ImageHeight();
   auto global_work_size = cl::NDRange{width, height};
-  status = helper.OpenCLCommandQueue().enqueueNDRangeKernel(
+  status = helper->OpenCLCommandQueue().enqueueNDRangeKernel(
       kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr,
       nullptr);
   CL_CHECK_ERRORS(status);
+  status = helper->OpenCLCommandQueue().finish();
+  CL_CHECK_ERRORS(status);
   VLOG(3) << " --- Out image: " << out_image << " --- ";
-  CopyImageData(out_image, out);
+  CopyImageData(helper, out_image, out);
 }
 }  // namespace lite
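After this refactor the caller owns the CLHelper and registers the kernels once; elementwise_add only looks them up by name and dispatches to channel_add when the bias is 1-D. A minimal usage sketch, mirroring the updated cl_test.cc and assuming InitOpenCLEngine(FLAGS_cl_path) has already succeeded:

    std::unique_ptr<CLContext> context(new CLContext);
    std::unique_ptr<CLHelper> helper(new CLHelper(context.get()));
    helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
    helper->AddKernel("channel_add", "channel_add_kernel.cl");
    // bias_dim.size() == 1 -> channel_add (per-channel broadcast),
    // bias_dim.size() == 4 -> plain elementwise_add.
    elementwise_add(helper.get(), in_data.get(), in_dim,
                    bias_data.get(), bias_dim, out.get(), out_dim);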
paddle/fluid/lite/opencl/cl_caller.h

@@ -16,7 +16,7 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/lite/core/compatible_tensor.h"
-#include "paddle/fluid/lite/opencl/cl_context.h"
+#include "paddle/fluid/lite/opencl/cl_helper.h"

 namespace paddle {
 namespace lite {

@@ -27,7 +27,7 @@ bool InitOpenCLEngine(std::string cl_path);
 /// black box so that the framework can remain simple.
 /// NOTE Currently, these methods are quite expensive, we will optimize them
 /// latter.
-void elementwise_add(CLContext* context, const float* in, const DDim& in_dim,
+void elementwise_add(CLHelper* helper, const float* in, const DDim& in_dim,
                      const float* bias, const DDim& bias_dim, float* out,
                      const DDim& out_dim);
paddle/fluid/lite/opencl/cl_engine.cc

@@ -156,8 +156,7 @@ bool CLEngine::InitializeDevice() {
   if (ext_data.find("cl_khr_fp16") != std::string::npos) {
     LOG(INFO) << "The chosen device supports the half data type.";
   } else {
-    LOG(ERROR) << "The chosen device doesn't support the half data type!";
-    return false;
+    LOG(INFO) << "The chosen device doesn't support the half data type!";
   }
   auto max_units = device_->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
   LOG(INFO) << "The chosen device has " << max_units << " compute units.";
paddle/fluid/lite/opencl/cl_half.cc (deleted, file mode 100644 -> 0)

This diff is collapsed: the 518-line half-precision conversion implementation is removed.
paddle/fluid/lite/opencl/cl_image.cc

@@ -16,7 +16,6 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <array>
 #include "paddle/fluid/lite/opencl/cl_engine.h"
-#include "paddle/fluid/lite/opencl/cl_half.h"
 #include "paddle/fluid/lite/opencl/cl_tool.h"

 namespace paddle {

@@ -26,7 +25,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
   int width = cl_image.image_dims_[0];
   int height = cl_image.image_dims_[1];

-  half_t* image_data = new half_t[height * width * 4];
+  float* image_data = new float[height * width * 4];
   cl::Image* image = cl_image.cl_image();
   const std::array<size_t, 3> origin{0, 0, 0};
   const std::array<size_t, 3> region{static_cast<size_t>(width),

@@ -131,9 +130,9 @@ void CLImage::InitCLImage(const cl::Context& context,
   image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
 #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
-  half_t* image_data = new half_t[image_dims_.product() * 4];
+  float* image_data = new float[image_dims_.product() * 4];
 #else
-  half_t* image_data = new half_t[image_dims_.production() * 4];
+  float* image_data = new float[image_dims_.production() * 4];
 #endif
   VLOG(3) << " convert to image ";

@@ -151,7 +150,7 @@ void CLImage::InitCLImage(const cl::Context& context,
 void CLImage::InitCLImage(const cl::Context& context, int width, int height,
                           void* data) {
-  cl::ImageFormat img_format(CL_RGBA, CL_HALF_FLOAT);
+  cl::ImageFormat img_format(CL_RGBA, CL_FLOAT);
   cl_int err;
   cl_image_.reset(new cl::Image2D(
       context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0),
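The format switch above is what ties the host and kernel sides together: read_imageh is only defined for half-precision image formats, so once the backing store becomes CL_RGBA/CL_FLOAT the kernels have to move to read_imagef/write_imagef, which is exactly what the .cl changes in this commit do. A host-side sketch of the new allocation, with error handling omitted; context, width, height and image_data stand for the values computed by the surrounding CLImage code:

    // Sketch only: an RGBA float image holding four channel values per texel.
    cl::ImageFormat img_format(CL_RGBA, CL_FLOAT);
    cl_int err;
    cl::Image2D image(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                      img_format, width, height, /*row_pitch=*/0,
                      image_data, &err);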
paddle/fluid/lite/opencl/cl_image_converter.cc

@@ -36,7 +36,7 @@ DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) {
       static_cast<DDim::value_type>(height)}));
 }

-void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
+void CLImageConverterDefault::NCHWToImage(float *nchw, float *image,
                                           const DDim &tensor_dim) {
   size_t new_dims[] = {1, 1, 1, 1};
   for (size_t j = 0; j < tensor_dim.size(); ++j) {

@@ -68,7 +68,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
         if (c < C) {
           // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
           // (c % 4);
-          image[i2] = Float2Half(*p);
+          image[i2] = *p;
           i2 += 4;
           p++;
         } else {

@@ -83,7 +83,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
   }
 }

-void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor,
+void CLImageConverterDefault::ImageToNCHW(float *image, float *tensor,
                                           const DDim &image_dim,
                                           const DDim &tensor_dim) {
   size_t new_dims[] = {1, 1, 1, 1};

@@ -107,7 +107,7 @@ void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor,
       for (size_t h = 0; h < H; h++) {
         size_t i2 = (i1 << 2) + c % 4;
         for (size_t w = 0; w < W; w++) {
-          *p = Half2Float(image[i2]);
+          *p = image[i2];
           i2 += 4;
           p++;
         }

@@ -161,7 +161,7 @@ DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) {
   }
 }

-void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image,
+void CLImageConverterFolder::NCHWToImage(float *tensor, float *image,
                                          const DDim &tensor_dim) {
   CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
       << " Tensor dim is not support!";

@@ -184,14 +184,13 @@ void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image,
     for (size_t h = 0; h < tdim[0]; h++) {
       for (size_t w = 0; w < tdim[1]; w++) {
-        image[(h * width + w / 4) * 4 + (w % 4)] =
-            Float2Half(tensor[h * tdim[1] + w]);
+        image[(h * width + w / 4) * 4 + (w % 4)] = tensor[h * tdim[1] + w];
       }
     }
   }
 }

-void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor,
+void CLImageConverterFolder::ImageToNCHW(float *image, float *tensor,
                                          const DDim &image_dim,
                                          const DDim &tensor_dim) {
   if (tensor_dim.size() > 2) {

@@ -213,7 +212,7 @@ void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor,
     for (size_t h = 0; h < H; h++) {
       for (size_t w = 0; w < W; w++) {
-        p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]);
+        p[h * W + w] = image[(h * width + w / 4) * 4 + (w % 4)];
       }
     }
   }

@@ -233,7 +232,7 @@ DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
       static_cast<DDim::value_type>(height)}));
 }

-void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
+void CLImageConverterNWBlock::NCHWToImage(float *tensor, float *image,
                                           const DDim &tensor_dim) {
   CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
   auto image_dim = InitImageDimInfoWith(tensor_dim);

@@ -253,7 +252,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
         size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
                        w * 4 + n % 4;
         if (n < N) {
-          image[index] = Float2Half(*p);
+          image[index] = *p;
           p++;
         } else {
           image[index] = 0.0;

@@ -268,7 +267,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
   VLOG(3) << " init done";
 }

-void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
+void CLImageConverterNWBlock::ImageToNCHW(float *image, float *tensor,
                                           const DDim &image_dim,
                                           const DDim &tensor_dim) {
   CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";

@@ -286,7 +285,7 @@ void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
       for (size_t w = 0; w < W; ++w) {
         size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
                        w * 4 + n % 4;
-        *p = Half2Float(image[index]);
+        *p = image[index];
         p++;
         if (index >= (width * height * 4)) {
           LOG(INFO) << " index out of range ";

@@ -312,7 +311,7 @@ DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
       static_cast<DDim::value_type>(height)}));
 }

-void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
+void CLImageConverterDWBlock::NCHWToImage(float *tensor, float *image,
                                           const DDim &tensor_dim) {
   size_t new_dims[] = {1, 1, 1, 1};
   for (size_t j = 0; j < tensor_dim.size(); ++j) {

@@ -344,7 +343,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
     if (c < C) {
       // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
       // (c % 4);
-      image[i2] = Float2Half(*p);
+      image[i2] = *p;
       i2 += 4;
       p++;
     } else {

@@ -359,7 +358,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
   }
 }

-void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
+void CLImageConverterDWBlock::ImageToNCHW(float *image, float *tensor,
                                           const DDim &image_dim,
                                           const DDim &tensor_dim) {
   CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";

@@ -377,7 +376,7 @@ void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
     for (size_t h = 0; h < H; h++) {
       size_t i2 = (i1 << 2) + c % 4;
       for (size_t w = 0; w < W; w++) {
-        *p = Half2Float(image[i2]);
+        *p = image[i2];
         i2 += 4;
         p++;
       }

@@ -410,7 +409,7 @@ DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) {
       static_cast<DDim::value_type>(height)}));
 }

-void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image,
+void CLImageConverterNormal::NCHWToImage(float *tensor, float *image,
                                          const DDim &tensor_dim) {
   CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
       << " Tensor dim is not support!";

@@ -419,7 +418,7 @@ void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image,
   default_converter.NCHWToImage(tensor, image, tensor_dim);
 }

-void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor,
+void CLImageConverterNormal::ImageToNCHW(float *image, float *tensor,
                                          const DDim &image_dim,
                                          const DDim &tensor_dim) {
   CLImageConverterDefault default_converter;

@@ -439,10 +438,10 @@ DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith(
       static_cast<DDim::value_type>(height)}));
 }

-void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, half_t *image,
+void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, float *image,
                                                   const DDim &tensor_dim) {}

-void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image, float *tensor,
+void CLImageConverterWinoTransWeight::ImageToNCHW(float *image, float *tensor,
                                                   const DDim &image_dim,
                                                   const DDim &tensor_dim) {}
paddle/fluid/lite/opencl/cl_image_converter.h

@@ -15,7 +15,6 @@ limitations under the License. */
 #pragma once

 #include "paddle/fluid/lite/core/compatible_tensor.h"
-#include "paddle/fluid/lite/opencl/cl_half.h"

 namespace paddle {
 namespace lite {

@@ -24,10 +23,10 @@ class CLImageConverterBase {
  public:
   virtual ~CLImageConverterBase() {}

-  virtual void NCHWToImage(float *nchw, half_t *image,
+  virtual void NCHWToImage(float *nchw, float *image,
                            const DDim &tensor_dim) = 0;
-  virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim,
+  virtual void ImageToNCHW(float *image, float *nchw, const DDim &image_dim,
                            const DDim &tensor_dim) = 0;
   virtual DDim InitImageDimInfoWith(const DDim &tensor_dim) = 0;
 };

@@ -35,16 +34,16 @@ class CLImageConverterBase {
 class CLImageConverterDefault : public CLImageConverterBase {
  public:
   DDim InitImageDimInfoWith(const DDim &tensor_dim);
-  void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim);
-  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+  void NCHWToImage(float *nchw, float *image, const DDim &tensor_dim);
+  void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
                    const DDim &tensor_dim);
 };

 class CLImageConverterFolder : public CLImageConverterBase {
  public:
   DDim InitImageDimInfoWith(const DDim &tensor_dim);
-  void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
-  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+  void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
+  void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
                    const DDim &tensor_dim);
   /*

@@ -68,8 +67,8 @@ class CLImageConverterFolder : public CLImageConverterBase {
 class CLImageConverterNormal : public CLImageConverterBase {
  public:
   DDim InitImageDimInfoWith(const DDim &tensor_dim);
-  void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
-  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+  void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
+  void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
                    const DDim &tensor_dim);
   /*

@@ -92,22 +91,22 @@ class CLImageConverterNormal : public CLImageConverterBase {
 class CLImageConverterNWBlock : public CLImageConverterBase {
   DDim InitImageDimInfoWith(const DDim &tensor_dim);
-  void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
-  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+  void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
+  void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
                    const DDim &tensor_dim);
 };
 class CLImageConverterDWBlock : public CLImageConverterBase {
   DDim InitImageDimInfoWith(const DDim &tensor_dim);
-  void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
-  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+  void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
+  void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
                    const DDim &tensor_dim);
 };

 class CLImageConverterWinoTransWeight : public CLImageConverterBase {
  public:
   DDim InitImageDimInfoWith(const DDim &tensor_dim);
-  void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
-  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+  void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
+  void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
                    const DDim &tensor_dim);
 };
paddle/fluid/lite/opencl/cl_half.h → paddle/fluid/lite/opencl/cl_kernel/channel_add_kernel.cl

@@ -12,21 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#pragma once
-
-#include <cstdint>
-
-namespace paddle {
-namespace lite {
-
-typedef uint16_t half_t;
-
-half_t Float2Half(float f);
-float Half2Float(half_t h);
-
-void FloatArray2HalfArray(float* f_array, half_t* h_array, int count);
-void HalfArray2FloatArray(half_t* h_array, float* f_array, int count);
-
-}  // namespace lite
-}  // namespace paddle
+__kernel void channel_add(__read_only image2d_t input,
+                          __read_only image2d_t bias,
+                          __write_only image2d_t outputImage,
+                          __private const int w) {
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  int2 coords;
+  coords.x = x;
+  coords.y = y;
+  int2 coords_bias;
+  coords_bias.x = x/w;
+  coords_bias.y = 0;
+  float4 in = read_imagef(input, sampler, coords);
+  float4 biase = read_imagef(bias, sampler, coords_bias);
+  float4 output = in + biase;
+  write_imagef(outputImage, coords, output);
+}
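The kernel receives the tensor width w (cl_caller.cc passes in_dim[3] as the fourth argument when the bias is 1-D). Since the image layout packs one W-wide block per group of four channels along x, the intent is that x / w indexes the bias texel for the current channel block, so every spatial position in that block reuses the same four bias values. On the CPU the same broadcast is what the new channel_add_test builds as its reference; the sketch below generalizes that loop, with channel_add_ref being a hypothetical helper name used only for illustration:

    // CPU reference for the per-channel broadcast (matches out_ref in the new
    // channel_add_test): every (n, c) plane of an NCHW tensor gets bias[c] added.
    void channel_add_ref(const float* in, const float* bias, float* out,
                         int N, int C, int H, int W) {
      for (int n = 0; n < N; ++n) {
        for (int c = 0; c < C; ++c) {
          const float b = bias[c];
          for (int k = 0; k < H * W; ++k) {
            const int index = (n * C + c) * H * W + k;
            out[index] = in[index] + b;
          }
        }
      }
    }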
paddle/fluid/lite/opencl/cl_kernel/cl_common.h

@@ -14,21 +14,19 @@ limitations under the License. */
 #pragma once

 #pragma OPENCL EXTENSION cl_khr_fp16 : enable

-inline half4 activation(half4 in
+inline float4 activation(float4 in
 #ifdef PRELU
-                        , half4 prelu_alpha
+                         , float4 prelu_alpha
 #endif
                         ) {
-  half4 output;
+  float4 output;
 #ifdef PRELU
-  output = select(prelu_alpha * in, in, in >= (half4)0.0);
+  output = select(prelu_alpha * in, in, in >= (float4)0.0);
 #endif
 #ifdef RELU
-  output = fmax(in, (half4)(0.0f));
+  output = fmax(in, (float4)(0.0f));
 #endif
   return output;
 }
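activation() relies on the PRELU and RELU macros being defined when the kernel source is compiled, so the fused activation is chosen per program build rather than at run time. With the plain OpenCL C++ API that selection would look roughly like the sketch below; it is illustrative only, kernel_source stands for the loaded .cl text, and how CLContext/CLHelper actually pass build options is not shown in this diff:

    // Sketch: compile the same kernel source with the RELU branch enabled.
    cl::Program program(context, kernel_source);
    cl_int err = program.build("-DRELU");  // or "-DPRELU" for the PReLU branch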
paddle/fluid/lite/opencl/cl_kernel/elementwise_add_kernel.cl

@@ -12,16 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-__kernel void elementwise_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) {
+__kernel void elementwise_add(__read_only image2d_t input,
+                              __read_only image2d_t bias,
+                              __write_only image2d_t outputImage) {
   int x = get_global_id(0);
   int y = get_global_id(1);
   const sampler_t sampler =
       CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
   int2 coords;
   coords.x = x;
   coords.y = y;
-  half4 in = read_imageh(input, sampler, coords);
-  half4 biase = read_imageh(bias, sampler, coords);
-  half4 output = in + biase;
-  write_imageh(outputImage,coords,output);
+  float4 in = read_imagef(input, sampler, coords);
+  float4 biase = read_imagef(bias, sampler, coords);
+  float4 output = in + biase;
+  write_imagef(outputImage,coords,output);
 }
paddle/fluid/lite/opencl/cl_kernel/pool_kernel.cl

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #define MIN_VALUE -FLT_MAX

 __kernel void pool_max(

@@ -41,16 +40,16 @@ __kernel void pool_max(
   const int pos_in_x = out_c * in_width;
   const int pos_in_y = out_n * in_height;
-  half4 max_value = (half4)(MIN_VALUE);
+  float4 max_value = (float4)(MIN_VALUE);
   for (int y = start_h; y < end_h; ++y) {
     for (int x = start_w; x < end_w; ++x) {
-      half4 tmp = read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
+      float4 tmp = read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
       max_value = max(max_value, tmp);
     }
   }
   const int pos_out_x = mad24(out_c, out_width, out_w);
-  write_imageh(output, (int2)(pos_out_x, out_nh), max_value);
+  write_imagef(output, (int2)(pos_out_x, out_nh), max_value);
 }

 __kernel void pool_avg(

@@ -77,15 +76,15 @@ __kernel void pool_avg(
   const int pos_in_x = out_c * in_width;
   const int pos_in_y = out_n * in_height;
-  half4 sum = (half4)(0.0f);
+  float4 sum = (float4)(0.0f);
   int num = 0;
   for (int y = start_h; y < end_h; ++y) {
     for (int x = start_w; x < end_w; ++x) {
-      sum += read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
+      sum += read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
       num++;
     }
   }
-  half4 avg = sum / num;
+  float4 avg = sum / num;
   const int pos_out_x = mad24(out_c, out_width, out_w);
-  write_imageh(output, (int2)(pos_out_x, out_nh), avg);
+  write_imagef(output, (int2)(pos_out_x, out_nh), avg);
 }
paddle/fluid/lite/opencl/cl_test.cc

@@ -67,28 +67,28 @@ TEST(cl_test, kernel_test) {
   helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
   auto kernel = helper->GetKernel(2);

-  std::unique_ptr<float[]> in_data(new float[1024 * 512]);
-  for (int i = 0; i < 1024 * 512; i++) {
+  std::unique_ptr<float[]> in_data(new float[4 * 3 * 256 * 512]);
+  for (int i = 0; i < 4 * 3 * 256 * 512; i++) {
     in_data[i] = 1.f;
   }
-  const DDim in_dim = DDim(std::vector<DDim::value_type>{1024, 512});
+  const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 3, 256, 512});
   CLImage in_image;
   in_image.set_tensor_data(in_data.get(), in_dim);
   in_image.InitNormalCLImage(helper->OpenCLContext());
   LOG(INFO) << in_image;

-  std::unique_ptr<float[]> bias_data(new float[1024 * 512]);
-  for (int i = 0; i < 1024 * 512; i++) {
+  std::unique_ptr<float[]> bias_data(new float[4 * 3 * 256 * 512]);
+  for (int i = 0; i < 4 * 3 * 256 * 512; i++) {
     bias_data[i] = 2.f;
   }
-  const DDim bias_dim = DDim(std::vector<DDim::value_type>{1024, 512});
+  const DDim bias_dim = DDim(std::vector<DDim::value_type>{4, 3, 256, 512});
   CLImage bias_image;
   bias_image.set_tensor_data(bias_data.get(), bias_dim);
   bias_image.InitNormalCLImage(helper->OpenCLContext());
   LOG(INFO) << bias_image;

   CLImage out_image;
-  const DDim out_dim = DDim(std::vector<DDim::value_type>{1024, 512});
+  const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 3, 256, 512});
   out_image.InitEmptyImage(helper->OpenCLContext(), out_dim);
   LOG(INFO) << out_image;

@@ -108,7 +108,8 @@ TEST(cl_test, kernel_test) {
   status = helper->OpenCLCommandQueue().enqueueNDRangeKernel(
       kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event);
   CL_CHECK_ERRORS(status);
+  status = helper->OpenCLCommandQueue().finish();
+  CL_CHECK_ERRORS(status);
   double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
   double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
   double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;

@@ -116,37 +117,99 @@ TEST(cl_test, kernel_test) {
   LOG(INFO) << out_image;
 }

-TEST(cl_test, elementwise_add_test) {
+TEST(cl_test, channel_add_test) {
   std::default_random_engine engine;
   std::uniform_real_distribution<float> dist(-5, 5);

-  const DDim in_dim = DDim(std::vector<DDim::value_type>{1024, 512});
-  std::unique_ptr<float[]> in_data(new float[1024 * 512]);
-  for (int i = 0; i < 1024 * 512; i++) {
+  const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
+  std::unique_ptr<float[]> in_data(new float[4 * 16 * 256 * 512]);
+  for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
     in_data[i] = dist(engine);
   }

-  const DDim bias_dim = DDim(std::vector<DDim::value_type>{1024, 512});
-  std::unique_ptr<float[]> bias_data(new float[1024 * 512]);
-  for (int i = 0; i < 1024 * 512; i++) {
+  const DDim bias_dim = DDim(std::vector<DDim::value_type>{16});
+  std::unique_ptr<float[]> bias_data(new float[16]);
+  for (int i = 0; i < 16; i++) {
     bias_data[i] = dist(engine);
   }

-  const DDim out_dim = DDim(std::vector<DDim::value_type>{1024, 512});
-  std::unique_ptr<float[]> out(new float[1024 * 512]);
+  std::unique_ptr<float[]> out_ref(new float[4 * 16 * 256 * 512]);
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 16; j++) {
+      float b = bias_data[j];
+      for (int k = 0; k < 256 * 512; k++) {
+        int index = (i * 16 + j) * 256 * 512 + k;
+        out_ref[index] = in_data[index] + b;
+      }
+    }
+  }
+
+  const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
+  std::unique_ptr<float[]> out(new float[4 * 16 * 256 * 512]);

   bool status = InitOpenCLEngine(FLAGS_cl_path);
   CHECK(status) << "Fail to initialize OpenCL engine.";
-  CLContext context;
-
-  elementwise_add(&context, in_data.get(), in_dim, bias_data.get(), bias_dim,
-                  out.get(), out_dim);
-
-  int stride = 1024 * 512 / 20;
-  for (int i = 0; i < 1024 * 512; i += stride) {
-    std::cout << out[i] << " ";
-  }
+  std::unique_ptr<CLContext> context(new CLContext);
+  std::unique_ptr<CLHelper> helper(new CLHelper(context.get()));
+  helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
+  helper->AddKernel("channel_add", "channel_add_kernel.cl");
+  elementwise_add(helper.get(), in_data.get(), in_dim, bias_data.get(),
+                  bias_dim, out.get(), out_dim);
+
+  int stride = 4 * 16 * 256 * 512 / 20;
+  for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) {
+    std::cout << out[i] << " ";
+  }
+
+  for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
+    EXPECT_NEAR(out[i], out_ref[i], 1e-6);
+  }
   std::cout << std::endl;
 }

+TEST(cl_test, elementwise_add_test) {
+  std::default_random_engine engine;
+  std::uniform_real_distribution<float> dist(-5, 5);
+
+  const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
+  std::unique_ptr<float[]> in_data(new float[4 * 16 * 256 * 512]);
+  for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
+    in_data[i] = dist(engine);
+  }
+
+  const DDim bias_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
+  std::unique_ptr<float[]> bias_data(new float[4 * 16 * 256 * 512]);
+  for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
+    bias_data[i] = dist(engine);
+  }
+
+  std::unique_ptr<float[]> out_ref(new float[4 * 16 * 256 * 512]);
+  for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
+    out_ref[i] = in_data[i] + bias_data[i];
+  }
+
+  const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
+  std::unique_ptr<float[]> out(new float[4 * 16 * 256 * 512]);
+
+  bool status = InitOpenCLEngine(FLAGS_cl_path);
+  CHECK(status) << "Fail to initialize OpenCL engine.";
+  std::unique_ptr<CLContext> context(new CLContext);
+  std::unique_ptr<CLHelper> helper(new CLHelper(context.get()));
+  helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
+  helper->AddKernel("channel_add", "channel_add_kernel.cl");
+  elementwise_add(helper.get(), in_data.get(), in_dim, bias_data.get(),
+                  bias_dim, out.get(), out_dim);
+
+  int stride = 4 * 16 * 256 * 512 / 20;
+  for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) {
+    std::cout << out[i] << " ";
+  }
+
+  for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
+    EXPECT_NEAR(out[i], out_ref[i], 1e-6);
+  }
+  std::cout << std::endl;
+}