Commit fc3a66ae
Authored Jun 25, 2019 by Chunwei

Merge branch 'fix_opencl_acc' into 'incubate/lite'

fix elementwise_add acc bugs. See merge request inference/paddlelite!58

Parents: 679aabac, 19bea13c
Showing 17 changed files with 210 additions and 633 deletions (+210 / -633)

paddle/fluid/lite/api/mobilenetv1_test.cc                          +1   -1
paddle/fluid/lite/core/context.h                                   +3   -0
paddle/fluid/lite/kernels/opencl/elementwise_add_compute.cc        +2   -2
paddle/fluid/lite/kernels/opencl/elementwise_add_compute_test.cc   +4   -4
paddle/fluid/lite/opencl/CMakeLists.txt                            +2   -4
paddle/fluid/lite/opencl/cl_caller.cc                              +24  -14
paddle/fluid/lite/opencl/cl_caller.h                               +2   -2
paddle/fluid/lite/opencl/cl_engine.cc                              +1   -2
paddle/fluid/lite/opencl/cl_half.cc                                +0   -518
paddle/fluid/lite/opencl/cl_image.cc                               +4   -5
paddle/fluid/lite/opencl/cl_image_converter.cc                     +20  -21
paddle/fluid/lite/opencl/cl_image_converter.h                      +14  -15
paddle/fluid/lite/opencl/cl_kernel/channel_add_kernel.cl           +29  -0
paddle/fluid/lite/opencl/cl_kernel/cl_common.h                     +7   -9
paddle/fluid/lite/opencl/cl_kernel/elementwise_add_kernel.cl       +5   -6
paddle/fluid/lite/opencl/cl_kernel/pool_kernel.cl                  +7   -8
paddle/fluid/lite/opencl/cl_test.cc                                +85  -22
paddle/fluid/lite/api/mobilenetv1_test.cc

@@ -61,7 +61,7 @@ void TestModel(const std::vector<Place>& valid_places,
        3.13812525e-05, 6.52209565e-05, 4.78087313e-05, 2.58822285e-04});
   for (int i = 0; i < results.size(); ++i) {
-    EXPECT_NEAR(out->data<float>()[i], results[i], 1e-5);
+    EXPECT_NEAR(out->data<float>()[i], results[i], 1e-6);
   }
   ASSERT_EQ(out->dims().size(), 2);
   ASSERT_EQ(out->dims()[0], 1);
paddle/fluid/lite/core/context.h

@@ -236,12 +236,15 @@ class Context<TargetType::kOpenCL> {
   void CopySharedTo(const OpenClContext* ctx) {
     ctx->cl_context_ = cl_context_;
     ctx->cl_helper_ = cl_helper_;
   }

  private:
   void PrepareKernels() {
     cl_helper_->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
+    cl_helper_->AddKernel("channel_add", "channel_add_kernel.cl");
+    cl_helper_->AddKernel("pool_max", "pool_kernel.cl");
+    cl_helper_->AddKernel("pool_avg", "pool_kernel.cl");
   }
 };
 #endif
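PrepareKernels registers every kernel this context may need up front, keyed by name, so later code can fetch them either by name (helper->GetKernel("channel_add")) or by registration index (helper->GetKernel(2) in cl_test.cc). A purely hypothetical sketch of such a registry, not the actual CLHelper implementation:

    #include <map>
    #include <string>
    #include <vector>

    // Hypothetical name -> kernel registry. AddKernel appends in registration
    // order, so a kernel can be fetched by name ("channel_add") or by index
    // (GetKernel(2) == the third AddKernel call).
    class KernelRegistry {
     public:
      void AddKernel(const std::string& name, const std::string& file) {
        index_.emplace(name, kernels_.size());
        kernels_.push_back(file);  // stand-in for the compiled cl::Kernel
      }
      const std::string& GetKernel(size_t i) const { return kernels_.at(i); }
      const std::string& GetKernel(const std::string& name) const {
        return kernels_.at(index_.at(name));
      }

     private:
      std::vector<std::string> kernels_;
      std::map<std::string, size_t> index_;
    };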
paddle/fluid/lite/kernels/opencl/elementwise_add_compute.cc

@@ -31,10 +31,10 @@ class ElementwiseAddCompute
   void Run() override {
     auto& param = *param_.get_mutable<param_t>();
     auto& context = ctx_->As<OpenClContext>();
-    CHECK(context.cl_context());
-    elementwise_add(context.cl_context(),
-                    static_cast<const float*>(param.X->raw_data()),
+    CHECK(context.cl_helper() != nullptr);
+    elementwise_add(context.cl_helper(),
+                    static_cast<const float*>(param.X->raw_data()),
                     param.X->dims(),
                     static_cast<const float*>(param.Y->raw_data()),
                     param.Y->dims(), param.Out->mutable_data<float>(),
                     param.Out->dims());
   }
paddle/fluid/lite/kernels/opencl/elementwise_add_compute_test.cc

@@ -40,9 +40,9 @@ TEST(elementwise_add, init) {
   kernel->SetParam(param);
   kernel->SetContext(std::move(context));

-  X.Resize({1, 10});
-  Y.Resize({1, 10});
-  Out.Resize({1, 10});
+  X.Resize({1, 1, 1, 10});
+  Y.Resize({1, 1, 1, 10});
+  Out.Resize({1, 1, 1, 10});

   auto* x_data = X.mutable_data<float>();
   auto* y_data = Y.mutable_data<float>();

@@ -56,7 +56,7 @@ TEST(elementwise_add, init) {
   kernel->Launch();

   for (int i = 0; i < 10; i++) {
-    EXPECT_NEAR(out_data[i], 3.4 * i, 1e-1);
+    EXPECT_NEAR(out_data[i], 3.4 * i, 1e-6);
   }
 }
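The {1, 10} → {1, 1, 1, 10} resize matches the 4-D layout the CLImage converters expect, and the tolerance drop from 1e-1 to 1e-6 is only possible because the images now hold fp32 instead of fp16. A rough, hedged illustration (not part of the patch) of why: near the largest expected output, 3.4 * 9 ≈ 30.6, the spacing of representable fp16 values is already about 1.6e-2.

    #include <cmath>
    #include <cstdio>

    int main() {
      const float v = 3.4f * 9;          // largest expected output in the test (~30.6)
      const int e = std::ilogb(v);       // binary exponent of v (4 for 30.6)
      // Unit in the last place at this magnitude: 2^(e - mantissa_bits).
      const double ulp_half = std::ldexp(1.0, e - 10);   // IEEE fp16: 10 mantissa bits
      const double ulp_float = std::ldexp(1.0, e - 23);  // IEEE fp32: 23 mantissa bits
      std::printf("ulp(half)  at %.1f ~ %g\n", v, ulp_half);   // ~1.6e-2
      std::printf("ulp(float) at %.1f ~ %g\n", v, ulp_float);  // ~1.9e-6
      return 0;
    }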
paddle/fluid/lite/opencl/CMakeLists.txt

@@ -5,13 +5,11 @@ endif()
 cc_library(cl_wrapper SRCS cl_wrapper.cc)
 cc_library(cl_tool SRCS cl_tool.cc)
 target_compile_options(cl_tool BEFORE PUBLIC -Wno-ignored-qualifiers)
-cc_library(cl_half SRCS cl_half.cc)
-target_compile_options(cl_half BEFORE PUBLIC -fno-strict-aliasing)
 cc_library(cl_engine SRCS cl_engine.cc DEPS cl_tool)
 cc_library(cl_context SRCS cl_context.cc DEPS cl_engine)
 cc_library(cl_helper SRCS cl_helper.cc DEPS cl_context)
-cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS cl_half lite_tensor)
-cc_library(cl_image SRCS cl_image.cc DEPS cl_half lite_tensor cl_image_converter cl_engine)
+cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS lite_tensor)
+cc_library(cl_image SRCS cl_image.cc DEPS lite_tensor cl_image_converter cl_engine)
 cc_library(cl_caller SRCS cl_caller.cc DEPS cl_helper cl_image)
 lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_helper cl_image cl_caller cl_wrapper)
 add_dependencies(cl_tool opencl_clhpp)
paddle/fluid/lite/opencl/cl_caller.cc

@@ -15,7 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/lite/opencl/cl_caller.h"
 #include <string>
 #include "paddle/fluid/lite/core/compatible_tensor.h"
-#include "paddle/fluid/lite/opencl/cl_context.h"
 #include "paddle/fluid/lite/opencl/cl_engine.h"
 #include "paddle/fluid/lite/opencl/cl_helper.h"
 #include "paddle/fluid/lite/opencl/cl_image.h"

@@ -23,16 +22,17 @@ limitations under the License. */
 namespace paddle {
 namespace lite {

-static void CopyImageData(const CLImage& cl_image, float* out) {
+static void CopyImageData(CLHelper* helper, const CLImage& cl_image,
+                          float* out) {
   int width = cl_image.image_dims()[0];
   int height = cl_image.image_dims()[1];

-  half_t* image_data = new half_t[height * width * 4];
+  float* image_data = new float[height * width * 4];
   cl::Image* image = cl_image.cl_image();
   const std::array<size_t, 3> origin{0, 0, 0};
   const std::array<size_t, 3> region{static_cast<size_t>(width),
                                      static_cast<size_t>(height), 1};
-  cl_int err = CLEngine::Global()->command_queue().enqueueReadImage(
+  cl_int err = helper->OpenCLCommandQueue().enqueueReadImage(
       *image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
   CL_CHECK_ERRORS(err);

@@ -49,22 +49,25 @@ bool InitOpenCLEngine(std::string cl_path) {
   return engine->IsInitSuccess();
 }

-void elementwise_add(CLContext* context, const float* in, const DDim& in_dim,
+void elementwise_add(CLHelper* helper, const float* in, const DDim& in_dim,
                      const float* bias, const DDim& bias_dim, float* out,
                      const DDim& out_dim) {
-  CLHelper helper(context);
-  helper.AddKernel("elementwise_add", "elementwise_add_kernel.cl");
-  auto kernel = helper.GetKernel(0);
+  if (!(bias_dim.size() == 1 || bias_dim.size() == 4)) {
+    LOG(FATAL) << "Error: bias dims is error";
+    return;
+  }
+  auto kernel = bias_dim.size() == 1 ? helper->GetKernel("channel_add")
+                                     : helper->GetKernel("elementwise_add");

   CLImage in_image;
   in_image.set_tensor_data(in, in_dim);
-  in_image.InitNormalCLImage(helper.OpenCLContext());
+  in_image.InitNormalCLImage(helper->OpenCLContext());
   VLOG(3) << " --- Inpu image: " << in_image << " --- ";

   CLImage bias_image;
   bias_image.set_tensor_data(bias, bias_dim);
-  bias_image.InitNormalCLImage(helper.OpenCLContext());
+  bias_image.InitCLImage(helper->OpenCLContext());
   VLOG(3) << " --- Bias image: " << bias_image << " --- ";

   CLImage out_image;
-  out_image.InitEmptyImage(helper.OpenCLContext(), out_dim);
+  out_image.InitEmptyImage(helper->OpenCLContext(), out_dim);

   cl_int status;
   status = kernel.setArg(0, *in_image.cl_image());
   CL_CHECK_ERRORS(status);

@@ -72,16 +75,23 @@ void elementwise_add(CLContext* context, const float* in, const DDim& in_dim,
   CL_CHECK_ERRORS(status);
   status = kernel.setArg(2, *out_image.cl_image());
   CL_CHECK_ERRORS(status);

+  if (bias_dim.size() == 1) {
+    int tensor_w = in_dim[3];
+    status = kernel.setArg(3, tensor_w);
+    CL_CHECK_ERRORS(status);
+  }
+
   size_t width = in_image.ImageWidth();
   size_t height = in_image.ImageHeight();
   auto global_work_size = cl::NDRange{width, height};
-  status = helper.OpenCLCommandQueue().enqueueNDRangeKernel(
+  status = helper->OpenCLCommandQueue().enqueueNDRangeKernel(
       kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr,
       nullptr);
   CL_CHECK_ERRORS(status);
+  status = helper->OpenCLCommandQueue().finish();
+  CL_CHECK_ERRORS(status);

   VLOG(3) << " --- Out image: " << out_image << " --- ";

-  CopyImageData(out_image, out);
+  CopyImageData(helper, out_image, out);
 }

 }  // namespace lite
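The rewritten elementwise_add now dispatches on the bias rank: a 1-D bias selects the channel_add kernel (one value per channel, broadcast over batch, height and width), while a 4-D bias keeps the plain elementwise_add kernel. A hedged CPU-side sketch of the two reference semantics, mirroring what the updated cl_test.cc checks against (the function name is illustrative):

    #include <cstddef>
    #include <vector>

    // Reference semantics only; the real work happens in the OpenCL kernels.
    // NCHW layout: index = ((n * C + c) * H + h) * W + w.
    void elementwise_add_ref(const std::vector<float>& in,
                             const std::vector<float>& bias,  // size C (1-D) or N*C*H*W (4-D)
                             std::vector<float>* out,
                             size_t N, size_t C, size_t H, size_t W,
                             bool channel_bias) {
      out->resize(N * C * H * W);
      for (size_t n = 0; n < N; ++n) {
        for (size_t c = 0; c < C; ++c) {
          for (size_t i = 0; i < H * W; ++i) {
            size_t idx = (n * C + c) * H * W + i;
            // channel_add: one bias value per channel; elementwise_add: full tensor.
            (*out)[idx] = in[idx] + (channel_bias ? bias[c] : bias[idx]);
          }
        }
      }
    }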
paddle/fluid/lite/opencl/cl_caller.h

@@ -16,7 +16,7 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/lite/core/compatible_tensor.h"
-#include "paddle/fluid/lite/opencl/cl_context.h"
+#include "paddle/fluid/lite/opencl/cl_helper.h"

 namespace paddle {
 namespace lite {

@@ -27,7 +27,7 @@ bool InitOpenCLEngine(std::string cl_path);
 /// black box so that the framework can remain simple.
 /// NOTE Currently, these methods are quite expensive, we will optimize them
 /// latter.
-void elementwise_add(CLContext* context, const float* in, const DDim& in_dim,
+void elementwise_add(CLHelper* helper, const float* in, const DDim& in_dim,
                      const float* bias, const DDim& bias_dim, float* out,
                      const DDim& out_dim);
paddle/fluid/lite/opencl/cl_engine.cc

@@ -156,8 +156,7 @@ bool CLEngine::InitializeDevice() {
   if (ext_data.find("cl_khr_fp16") != std::string::npos) {
     LOG(INFO) << "The chosen device supports the half data type.";
   } else {
-    LOG(ERROR) << "The chosen device doesn't support the half data type!";
-    return false;
+    LOG(INFO) << "The chosen device doesn't support the half data type!";
   }
   auto max_units = device_->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
   LOG(INFO) << "The chosen device has " << max_units << " compute units.";
paddle/fluid/lite/opencl/cl_half.cc

Deleted file (mode 100644 → 0); the 518 removed lines are collapsed in the diff view.
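Deleting cl_half.cc (and the half_t paths below) is the heart of the accuracy fix: binary16 carries only 10 mantissa bits, so staging tensors through half-precision images caps relative accuracy at roughly 1e-3, which is why the old tests needed tolerances as loose as 1e-1. A small, self-contained comparison of the two formats' machine epsilons (not from the patch):

    #include <cmath>
    #include <cstdio>

    int main() {
      // Machine epsilon = spacing between 1.0 and the next representable value.
      const double eps_half = std::ldexp(1.0, -10);   // IEEE binary16: ~9.8e-4
      const double eps_float = std::ldexp(1.0, -23);  // IEEE binary32: ~1.2e-7
      std::printf("half  epsilon: %g\n", eps_half);
      std::printf("float epsilon: %g\n", eps_float);
      return 0;
    }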
paddle/fluid/lite/opencl/cl_image.cc

@@ -16,7 +16,6 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <array>
 #include "paddle/fluid/lite/opencl/cl_engine.h"
-#include "paddle/fluid/lite/opencl/cl_half.h"
 #include "paddle/fluid/lite/opencl/cl_tool.h"

 namespace paddle {

@@ -26,7 +25,7 @@ std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) {
   int width = cl_image.image_dims_[0];
   int height = cl_image.image_dims_[1];

-  half_t* image_data = new half_t[height * width * 4];
+  float* image_data = new float[height * width * 4];
   cl::Image* image = cl_image.cl_image();
   const std::array<size_t, 3> origin{0, 0, 0};
   const std::array<size_t, 3> region{static_cast<size_t>(width),

@@ -131,9 +130,9 @@ void CLImage::InitCLImage(const cl::Context& context,
   image_dims_ = converter->InitImageDimInfoWith(tensor_dims_);
 #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
-  half_t* image_data = new half_t[image_dims_.product() * 4];
+  float* image_data = new float[image_dims_.product() * 4];
 #else
-  half_t* image_data = new half_t[image_dims_.production() * 4];
+  float* image_data = new float[image_dims_.production() * 4];
 #endif
   VLOG(3) << " convert to image ";

@@ -151,7 +150,7 @@ void CLImage::InitCLImage(const cl::Context& context,
 void CLImage::InitCLImage(const cl::Context& context, int width, int height,
                           void* data) {
-  cl::ImageFormat img_format(CL_RGBA, CL_HALF_FLOAT);
+  cl::ImageFormat img_format(CL_RGBA, CL_FLOAT);
   cl_int err;
   cl_image_.reset(new cl::Image2D(
       context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0),
paddle/fluid/lite/opencl/cl_image_converter.cc

@@ -36,7 +36,7 @@ DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) {
       static_cast<DDim::value_type>(height)}));
 }

-void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
+void CLImageConverterDefault::NCHWToImage(float *nchw, float *image,
                                           const DDim &tensor_dim) {
   size_t new_dims[] = {1, 1, 1, 1};
   for (size_t j = 0; j < tensor_dim.size(); ++j) {

@@ -68,7 +68,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
         if (c < C) {
           // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
           // (c % 4);
-          image[i2] = Float2Half(*p);
+          image[i2] = *p;
           i2 += 4;
           p++;
         } else {

@@ -83,7 +83,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, half_t *image,
   }
 }

-void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor,
+void CLImageConverterDefault::ImageToNCHW(float *image, float *tensor,
                                           const DDim &image_dim,
                                           const DDim &tensor_dim) {
   size_t new_dims[] = {1, 1, 1, 1};

@@ -107,7 +107,7 @@ void CLImageConverterDefault::ImageToNCHW(half_t *image, float *tensor,
       for (size_t h = 0; h < H; h++) {
         size_t i2 = (i1 << 2) + c % 4;
         for (size_t w = 0; w < W; w++) {
-          *p = Half2Float(image[i2]);
+          *p = image[i2];
           i2 += 4;
           p++;
         }

@@ -161,7 +161,7 @@ DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) {
   }
 }

-void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image,
+void CLImageConverterFolder::NCHWToImage(float *tensor, float *image,
                                          const DDim &tensor_dim) {
   CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
       << " Tensor dim is not support!";

@@ -184,14 +184,13 @@ void CLImageConverterFolder::NCHWToImage(float *tensor, half_t *image,
     for (size_t h = 0; h < tdim[0]; h++) {
       for (size_t w = 0; w < tdim[1]; w++) {
-        image[(h * width + w / 4) * 4 + (w % 4)] =
-            Float2Half(tensor[h * tdim[1] + w]);
+        image[(h * width + w / 4) * 4 + (w % 4)] = tensor[h * tdim[1] + w];
       }
     }
   }
 }

-void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor,
+void CLImageConverterFolder::ImageToNCHW(float *image, float *tensor,
                                          const DDim &image_dim,
                                          const DDim &tensor_dim) {
   if (tensor_dim.size() > 2) {

@@ -213,7 +212,7 @@ void CLImageConverterFolder::ImageToNCHW(half_t *image, float *tensor,
     for (size_t h = 0; h < H; h++) {
       for (size_t w = 0; w < W; w++) {
-        p[h * W + w] = Half2Float(image[(h * width + w / 4) * 4 + (w % 4)]);
+        p[h * W + w] = image[(h * width + w / 4) * 4 + (w % 4)];
       }
     }
   }

@@ -233,7 +232,7 @@ DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
       static_cast<DDim::value_type>(height)}));
 }

-void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
+void CLImageConverterNWBlock::NCHWToImage(float *tensor, float *image,
                                           const DDim &tensor_dim) {
   CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";
   auto image_dim = InitImageDimInfoWith(tensor_dim);

@@ -253,7 +252,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
         size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
                        w * 4 + n % 4;
         if (n < N) {
-          image[index] = Float2Half(*p);
+          image[index] = *p;
           p++;
         } else {
           image[index] = 0.0;

@@ -268,7 +267,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, half_t *image,
   VLOG(3) << " init done";
 }

-void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
+void CLImageConverterNWBlock::ImageToNCHW(float *image, float *tensor,
                                           const DDim &image_dim,
                                           const DDim &tensor_dim) {
   CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";

@@ -286,7 +285,7 @@ void CLImageConverterNWBlock::ImageToNCHW(half_t *image, float *tensor,
       for (size_t w = 0; w < W; ++w) {
         size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) +
                        w * 4 + n % 4;
-        *p = Half2Float(image[index]);
+        *p = image[index];
         p++;
         if (index >= (width * height * 4)) {
           LOG(INFO) << " index out of range ";

@@ -312,7 +311,7 @@ DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) {
       static_cast<DDim::value_type>(height)}));
 }

-void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
+void CLImageConverterDWBlock::NCHWToImage(float *tensor, float *image,
                                           const DDim &tensor_dim) {
   size_t new_dims[] = {1, 1, 1, 1};
   for (size_t j = 0; j < tensor_dim.size(); ++j) {

@@ -344,7 +343,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
         if (c < C) {
           // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 +
           // (c % 4);
-          image[i2] = Float2Half(*p);
+          image[i2] = *p;
           i2 += 4;
           p++;
         } else {

@@ -359,7 +358,7 @@ void CLImageConverterDWBlock::NCHWToImage(float *tensor, half_t *image,
   }
 }

-void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
+void CLImageConverterDWBlock::ImageToNCHW(float *image, float *tensor,
                                           const DDim &image_dim,
                                           const DDim &tensor_dim) {
   CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4.";

@@ -377,7 +376,7 @@ void CLImageConverterDWBlock::ImageToNCHW(half_t *image, float *tensor,
       for (size_t h = 0; h < H; h++) {
         size_t i2 = (i1 << 2) + c % 4;
         for (size_t w = 0; w < W; w++) {
-          *p = Half2Float(image[i2]);
+          *p = image[i2];
           i2 += 4;
           p++;
         }

@@ -410,7 +409,7 @@ DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) {
       static_cast<DDim::value_type>(height)}));
 }

-void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image,
+void CLImageConverterNormal::NCHWToImage(float *tensor, float *image,
                                          const DDim &tensor_dim) {
   CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0)
       << " Tensor dim is not support!";

@@ -419,7 +418,7 @@ void CLImageConverterNormal::NCHWToImage(float *tensor, half_t *image,
   default_converter.NCHWToImage(tensor, image, tensor_dim);
 }

-void CLImageConverterNormal::ImageToNCHW(half_t *image, float *tensor,
+void CLImageConverterNormal::ImageToNCHW(float *image, float *tensor,
                                          const DDim &image_dim,
                                          const DDim &tensor_dim) {
   CLImageConverterDefault default_converter;

@@ -439,10 +438,10 @@ DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith(
       static_cast<DDim::value_type>(height)}));
 }

-void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, half_t *image,
+void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, float *image,
                                                   const DDim &tensor_dim) {}

-void CLImageConverterWinoTransWeight::ImageToNCHW(half_t *image, float *tensor,
+void CLImageConverterWinoTransWeight::ImageToNCHW(float *image, float *tensor,
                                                   const DDim &image_dim,
                                                   const DDim &tensor_dim) {}
paddle/fluid/lite/opencl/cl_image_converter.h

@@ -15,7 +15,6 @@ limitations under the License. */
 #pragma once

 #include "paddle/fluid/lite/core/compatible_tensor.h"
-#include "paddle/fluid/lite/opencl/cl_half.h"

 namespace paddle {
 namespace lite {

@@ -24,10 +23,10 @@ class CLImageConverterBase {
  public:
   virtual ~CLImageConverterBase() {}

-  virtual void NCHWToImage(float *nchw, half_t *image,
+  virtual void NCHWToImage(float *nchw, float *image,
                            const DDim &tensor_dim) = 0;
-  virtual void ImageToNCHW(half_t *image, float *nchw, const DDim &image_dim,
+  virtual void ImageToNCHW(float *image, float *nchw, const DDim &image_dim,
                            const DDim &tensor_dim) = 0;
   virtual DDim InitImageDimInfoWith(const DDim &tensor_dim) = 0;
 };

@@ -35,16 +34,16 @@ class CLImageConverterBase {
 class CLImageConverterDefault : public CLImageConverterBase {
  public:
   DDim InitImageDimInfoWith(const DDim &tensor_dim);
-  void NCHWToImage(float *nchw, half_t *image, const DDim &tensor_dim);
-  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+  void NCHWToImage(float *nchw, float *image, const DDim &tensor_dim);
+  void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
                    const DDim &tensor_dim);
 };

 class CLImageConverterFolder : public CLImageConverterBase {
  public:
   DDim InitImageDimInfoWith(const DDim &tensor_dim);
-  void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
-  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+  void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
+  void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
                    const DDim &tensor_dim);

   /*

@@ -68,8 +67,8 @@ class CLImageConverterFolder : public CLImageConverterBase {
 class CLImageConverterNormal : public CLImageConverterBase {
  public:
   DDim InitImageDimInfoWith(const DDim &tensor_dim);
-  void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
-  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+  void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
+  void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
                    const DDim &tensor_dim);

   /*

@@ -92,22 +91,22 @@ class CLImageConverterNormal : public CLImageConverterBase {
 class CLImageConverterNWBlock : public CLImageConverterBase {
   DDim InitImageDimInfoWith(const DDim &tensor_dim);
-  void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
-  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+  void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
+  void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
                    const DDim &tensor_dim);
 };
 class CLImageConverterDWBlock : public CLImageConverterBase {
   DDim InitImageDimInfoWith(const DDim &tensor_dim);
-  void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
-  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+  void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
+  void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
                    const DDim &tensor_dim);
 };

 class CLImageConverterWinoTransWeight : public CLImageConverterBase {
  public:
   DDim InitImageDimInfoWith(const DDim &tensor_dim);
-  void NCHWToImage(float *tensor, half_t *image, const DDim &tensor_dim);
-  void ImageToNCHW(half_t *image, float *tensor, const DDim &image_dim,
+  void NCHWToImage(float *tensor, float *image, const DDim &tensor_dim);
+  void ImageToNCHW(float *image, float *tensor, const DDim &image_dim,
                    const DDim &tensor_dim);
 };
paddle/fluid/lite/opencl/cl_half.h → paddle/fluid/lite/opencl/cl_kernel/channel_add_kernel.cl

@@ -12,21 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#pragma once
-
-#include <cstdint>
-
-namespace paddle {
-namespace lite {
-
-typedef uint16_t half_t;
-
-half_t Float2Half(float f);
-float Half2Float(half_t h);
-
-void FloatArray2HalfArray(float *f_array, half_t *h_array, int count);
-void HalfArray2FloatArray(half_t *h_array, float *f_array, int count);
-
-}  // namespace lite
-}  // namespace paddle
+__kernel void channel_add(__read_only image2d_t input,
+                          __read_only image2d_t bias,
+                          __write_only image2d_t outputImage,
+                          __private const int w) {
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  int2 coords;
+  coords.x = x;
+  coords.y = y;
+  int2 coords_bias;
+  coords_bias.x = x / w;
+  coords_bias.y = 0;
+  float4 in = read_imagef(input, sampler, coords);
+  float4 biase = read_imagef(bias, sampler, coords_bias);
+  float4 output = in + biase;
+  write_imagef(outputImage, coords, output);
+}
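channel_add fetches the bias from pixel (x / w, 0). This relies on the CLImage layout visible in the converter code below (x_pixel = (c / 4) * W + w_col, y_pixel = n * H + h): integer-dividing the output x coordinate by the tensor width recovers c / 4, the pixel of the one-row bias image whose four lanes hold that channel group. A hedged host-side sketch of that mapping, assuming the default layout (names are illustrative):

    #include <cassert>
    #include <cstddef>

    // Sketch of the coordinate mapping assumed by channel_add. For an NCHW
    // tensor in the default CLImage layout, a pixel at column x holds channels
    // [4 * (x / W), 4 * (x / W) + 4).
    struct ImageCoord { size_t x, y; };

    ImageCoord NCHWToImageCoord(size_t n, size_t c, size_t h, size_t w,
                                size_t H, size_t W) {
      return {(c / 4) * W + w,  // channel blocks of 4 laid out along x
              n * H + h};       // batch and height stacked along y
    }

    int main() {
      const size_t H = 256, W = 512, C = 16;
      for (size_t c = 0; c < C; ++c) {
        ImageCoord p = NCHWToImageCoord(1, c, 3, 7, H, W);
        // The kernel's bias lookup: coords_bias.x = x / w, coords_bias.y = 0.
        assert(p.x / W == c / 4);
      }
      return 0;
    }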
paddle/fluid/lite/opencl/cl_kernel/cl_common.h

@@ -14,21 +14,19 @@ limitations under the License. */
 #pragma once

-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-inline half4 activation(half4 in
+inline float4 activation(float4 in
 #ifdef PRELU
                         ,
-                        half4 prelu_alpha
+                        float4 prelu_alpha
 #endif
                         ) {
-  half4 output;
+  float4 output;
 #ifdef PRELU
-  output = select(prelu_alpha * in, in, in >= (half4)0.0);
+  output = select(prelu_alpha * in, in, in >= (float4)0.0);
 #endif
 #ifdef RELU
-  output = fmax(in, (half4)(0.0f));
+  output = fmax(in, (float4)(0.0f));
 #endif
   return output;
 }
paddle/fluid/lite/opencl/cl_kernel/elementwise_add_kernel.cl

@@ -12,16 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-
-__kernel void elementwise_add(__global image2d_t input, __global image2d_t bias,__write_only image2d_t outputImage) {
+__kernel void elementwise_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage) {
   int x = get_global_id(0);
   int y = get_global_id(1);
   const sampler_t sampler =
       CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
   int2 coords;
   coords.x = x;
   coords.y = y;
-  half4 in = read_imageh(input, sampler, coords);
-  half4 biase = read_imageh(bias, sampler, coords);
-  half4 output = in + biase;
-  write_imageh(outputImage,coords,output);
+  float4 in = read_imagef(input, sampler, coords);
+  float4 biase = read_imagef(bias, sampler, coords);
+  float4 output = in + biase;
+  write_imagef(outputImage,coords,output);
 }
paddle/fluid/lite/opencl/cl_kernel/pool_kernel.cl

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #define MIN_VALUE -FLT_MAX

 __kernel void pool_max(

@@ -41,16 +40,16 @@ __kernel void pool_max(
   const int pos_in_x = out_c * in_width;
   const int pos_in_y = out_n * in_height;
-  half4 max_value = (half4)(MIN_VALUE);
+  float4 max_value = (float4)(MIN_VALUE);
   for (int y = start_h; y < end_h; ++y) {
     for (int x = start_w; x < end_w; ++x) {
-      half4 tmp = read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
+      float4 tmp = read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
       max_value = max(max_value, tmp);
     }
   }

   const int pos_out_x = mad24(out_c, out_width, out_w);
-  write_imageh(output, (int2)(pos_out_x, out_nh), max_value);
+  write_imagef(output, (int2)(pos_out_x, out_nh), max_value);
 }

 __kernel void pool_avg(

@@ -77,15 +76,15 @@ __kernel void pool_avg(
   const int pos_in_x = out_c * in_width;
   const int pos_in_y = out_n * in_height;

-  half4 sum = (half4)(0.0f);
+  float4 sum = (float4)(0.0f);
   int num = 0;
   for (int y = start_h; y < end_h; ++y) {
     for (int x = start_w; x < end_w; ++x) {
-      sum += read_imageh(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
+      sum += read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
       num++;
     }
   }
-  half4 avg = sum / num;
+  float4 avg = sum / num;
   const int pos_out_x = mad24(out_c, out_width, out_w);
-  write_imageh(output, (int2)(pos_out_x, out_nh), avg);
+  write_imagef(output, (int2)(pos_out_x, out_nh), avg);
 }
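As a point of reference, pool_avg divides the window sum by num, the count of samples actually read, so boundary windows are averaged over their real size. A hedged CPU sketch of the same averaging rule (the padding/clamping logic is an assumption; it is not shown in the hunk above):

    #include <algorithm>
    #include <vector>

    // Average pooling over a single H x W plane with the same "divide by the
    // number of in-bounds samples" rule the pool_avg kernel uses.
    std::vector<float> AvgPool2D(const std::vector<float>& in, int H, int W,
                                 int ksize, int stride, int pad) {
      const int out_h = (H + 2 * pad - ksize) / stride + 1;
      const int out_w = (W + 2 * pad - ksize) / stride + 1;
      std::vector<float> out(out_h * out_w, 0.f);
      for (int oh = 0; oh < out_h; ++oh) {
        for (int ow = 0; ow < out_w; ++ow) {
          int start_h = std::max(oh * stride - pad, 0);
          int end_h = std::min(oh * stride - pad + ksize, H);
          int start_w = std::max(ow * stride - pad, 0);
          int end_w = std::min(ow * stride - pad + ksize, W);
          float sum = 0.f;
          int num = 0;
          for (int y = start_h; y < end_h; ++y) {
            for (int x = start_w; x < end_w; ++x) {
              sum += in[y * W + x];
              ++num;
            }
          }
          out[oh * out_w + ow] = sum / num;  // matches the kernel: avg = sum / num
        }
      }
      return out;
    }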
paddle/fluid/lite/opencl/cl_test.cc

@@ -67,28 +67,28 @@ TEST(cl_test, kernel_test) {
   helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
   auto kernel = helper->GetKernel(2);

-  std::unique_ptr<float[]> in_data(new float[1024 * 512]);
-  for (int i = 0; i < 1024 * 512; i++) {
+  std::unique_ptr<float[]> in_data(new float[4 * 3 * 256 * 512]);
+  for (int i = 0; i < 4 * 3 * 256 * 512; i++) {
     in_data[i] = 1.f;
   }
-  const DDim in_dim = DDim(std::vector<DDim::value_type>{1024, 512});
+  const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 3, 256, 512});
   CLImage in_image;
   in_image.set_tensor_data(in_data.get(), in_dim);
   in_image.InitNormalCLImage(helper->OpenCLContext());
   LOG(INFO) << in_image;

-  std::unique_ptr<float[]> bias_data(new float[1024 * 512]);
-  for (int i = 0; i < 1024 * 512; i++) {
+  std::unique_ptr<float[]> bias_data(new float[4 * 3 * 256 * 512]);
+  for (int i = 0; i < 4 * 3 * 256 * 512; i++) {
     bias_data[i] = 2.f;
   }
-  const DDim bias_dim = DDim(std::vector<DDim::value_type>{1024, 512});
+  const DDim bias_dim = DDim(std::vector<DDim::value_type>{4, 3, 256, 512});
   CLImage bias_image;
   bias_image.set_tensor_data(bias_data.get(), bias_dim);
   bias_image.InitNormalCLImage(helper->OpenCLContext());
   LOG(INFO) << bias_image;

   CLImage out_image;
-  const DDim out_dim = DDim(std::vector<DDim::value_type>{1024, 512});
+  const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 3, 256, 512});
   out_image.InitEmptyImage(helper->OpenCLContext(), out_dim);
   LOG(INFO) << out_image;

@@ -108,7 +108,8 @@ TEST(cl_test, kernel_test) {
   status = helper->OpenCLCommandQueue().enqueueNDRangeKernel(
       kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event);
   CL_CHECK_ERRORS(status);
-
+  status = helper->OpenCLCommandQueue().finish();
+  CL_CHECK_ERRORS(status);
   double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
   double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
   double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;

@@ -116,37 +117,99 @@ TEST(cl_test, kernel_test) {
   LOG(INFO) << out_image;
 }

-TEST(cl_test, elementwise_add_test) {
+TEST(cl_test, channel_add_test) {
   std::default_random_engine engine;
   std::uniform_real_distribution<float> dist(-5, 5);

-  const DDim in_dim = DDim(std::vector<DDim::value_type>{1024, 512});
-  std::unique_ptr<float[]> in_data(new float[1024 * 512]);
-  for (int i = 0; i < 1024 * 512; i++) {
+  const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
+  std::unique_ptr<float[]> in_data(new float[4 * 16 * 256 * 512]);
+  for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
     in_data[i] = dist(engine);
   }

-  const DDim bias_dim = DDim(std::vector<DDim::value_type>{1024, 512});
-  std::unique_ptr<float[]> bias_data(new float[1024 * 512]);
-  for (int i = 0; i < 1024 * 512; i++) {
+  const DDim bias_dim = DDim(std::vector<DDim::value_type>{16});
+  std::unique_ptr<float[]> bias_data(new float[16]);
+  for (int i = 0; i < 16; i++) {
     bias_data[i] = dist(engine);
   }

-  const DDim out_dim = DDim(std::vector<DDim::value_type>{1024, 512});
-  std::unique_ptr<float[]> out(new float[1024 * 512]);
+  std::unique_ptr<float[]> out_ref(new float[4 * 16 * 256 * 512]);
+  for (int i = 0; i < 4; i++) {
+    for (int j = 0; j < 16; j++) {
+      float b = bias_data[j];
+      for (int k = 0; k < 256 * 512; k++) {
+        int index = (i * 16 + j) * 256 * 512 + k;
+        out_ref[index] = in_data[index] + b;
+      }
+    }
+  }
+
+  const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
+  std::unique_ptr<float[]> out(new float[4 * 16 * 256 * 512]);

   bool status = InitOpenCLEngine(FLAGS_cl_path);
   CHECK(status) << "Fail to initialize OpenCL engine.";
-  CLContext context;
-  elementwise_add(&context, in_data.get(), in_dim, bias_data.get(), bias_dim,
-                  out.get(), out_dim);
+  std::unique_ptr<CLContext> context(new CLContext);
+  std::unique_ptr<CLHelper> helper(new CLHelper(context.get()));
+  helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
+  helper->AddKernel("channel_add", "channel_add_kernel.cl");
+  elementwise_add(helper.get(), in_data.get(), in_dim, bias_data.get(),
+                  bias_dim, out.get(), out_dim);

-  int stride = 1024 * 512 / 20;
-  for (int i = 0; i < 1024 * 512; i += stride) {
-    std::cout << out[i] << " ";
-  }
+  int stride = 4 * 16 * 256 * 512 / 20;
+  for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) {
+    std::cout << out[i] << " ";
+  }
+
+  for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
+    EXPECT_NEAR(out[i], out_ref[i], 1e-6);
+  }
   std::cout << std::endl;
 }

+TEST(cl_test, elementwise_add_test) {
+  std::default_random_engine engine;
+  std::uniform_real_distribution<float> dist(-5, 5);
+
+  const DDim in_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
+  std::unique_ptr<float[]> in_data(new float[4 * 16 * 256 * 512]);
+  for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
+    in_data[i] = dist(engine);
+  }
+
+  const DDim bias_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
+  std::unique_ptr<float[]> bias_data(new float[4 * 16 * 256 * 512]);
+  for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
+    bias_data[i] = dist(engine);
+  }
+
+  std::unique_ptr<float[]> out_ref(new float[4 * 16 * 256 * 512]);
+  for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
+    out_ref[i] = in_data[i] + bias_data[i];
+  }
+
+  const DDim out_dim = DDim(std::vector<DDim::value_type>{4, 16, 256, 512});
+  std::unique_ptr<float[]> out(new float[4 * 16 * 256 * 512]);
+
+  bool status = InitOpenCLEngine(FLAGS_cl_path);
+  CHECK(status) << "Fail to initialize OpenCL engine.";
+  std::unique_ptr<CLContext> context(new CLContext);
+  std::unique_ptr<CLHelper> helper(new CLHelper(context.get()));
+  helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
+  helper->AddKernel("channel_add", "channel_add_kernel.cl");
+  elementwise_add(helper.get(), in_data.get(), in_dim, bias_data.get(),
+                  bias_dim, out.get(), out_dim);
+
+  int stride = 4 * 16 * 256 * 512 / 20;
+  for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) {
+    std::cout << out[i] << " ";
+  }
+
+  for (int i = 0; i < 4 * 16 * 256 * 512; i++) {
+    EXPECT_NEAR(out[i], out_ref[i], 1e-6);
+  }
+  std::cout << std::endl;
+}