Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
6554854a
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
338
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
6554854a
编写于
12月 27, 2019
作者:
L
Liu Yiqun
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into step_rnn/opt_ddim_lite
test=develop
上级
aef8084f
9171b70e
变更
57
隐藏空白更改
内联
并排
Showing
57 changed file
with
3377 addition
and
2150 deletion
+3377
-2150
cmake/cross_compiling/ios.cmake
cmake/cross_compiling/ios.cmake
+1
-0
lite/api/CMakeLists.txt
lite/api/CMakeLists.txt
+20
-0
lite/api/lite_multithread_test.cc
lite/api/lite_multithread_test.cc
+360
-0
lite/backends/x86/cpu_info.cc
lite/backends/x86/cpu_info.cc
+29
-19
lite/backends/x86/dynamic_loader.cc
lite/backends/x86/dynamic_loader.cc
+52
-42
lite/backends/x86/jit/gen_base.cc
lite/backends/x86/jit/gen_base.cc
+3
-1
lite/backends/x86/jit/gen_base.h
lite/backends/x86/jit/gen_base.h
+3
-2
lite/backends/x86/math/beam_search.cc
lite/backends/x86/math/beam_search.cc
+2
-1
lite/backends/x86/math/detail/avx_mathfun.h
lite/backends/x86/math/detail/avx_mathfun.h
+132
-125
lite/core/kernel.h
lite/core/kernel.h
+3
-6
lite/core/memory.h
lite/core/memory.h
+1
-0
lite/core/profile/profiler.cc
lite/core/profile/profiler.cc
+50
-28
lite/core/profile/profiler.h
lite/core/profile/profiler.h
+21
-5
lite/core/profile/test_timer.cc
lite/core/profile/test_timer.cc
+3
-3
lite/core/program.cc
lite/core/program.cc
+9
-4
lite/core/program.h
lite/core/program.h
+2
-1
lite/core/tensor.h
lite/core/tensor.h
+4
-0
lite/kernels/arm/conditional_block_compute.cc
lite/kernels/arm/conditional_block_compute.cc
+3
-0
lite/kernels/arm/split_lod_tensor_compute.cc
lite/kernels/arm/split_lod_tensor_compute.cc
+4
-0
lite/kernels/arm/unsqueeze_compute.cc
lite/kernels/arm/unsqueeze_compute.cc
+4
-4
lite/kernels/arm/yolo_box_compute.cc
lite/kernels/arm/yolo_box_compute.cc
+2
-1
lite/kernels/cuda/softmax_compute.cu
lite/kernels/cuda/softmax_compute.cu
+13
-15
lite/kernels/cuda/softmax_compute.h
lite/kernels/cuda/softmax_compute.h
+5
-3
lite/kernels/npu/bridges/engine.h
lite/kernels/npu/bridges/engine.h
+5
-2
lite/kernels/npu/subgraph_compute.cc
lite/kernels/npu/subgraph_compute.cc
+2
-1
lite/kernels/npu/subgraph_compute.h
lite/kernels/npu/subgraph_compute.h
+3
-2
lite/kernels/x86/gru_compute.cc
lite/kernels/x86/gru_compute.cc
+6
-3
lite/kernels/x86/gru_compute.h
lite/kernels/x86/gru_compute.h
+3
-2
lite/kernels/xpu/bridges/matmul_op.cc
lite/kernels/xpu/bridges/matmul_op.cc
+64
-6
lite/kernels/xpu/bridges/mul_op.cc
lite/kernels/xpu/bridges/mul_op.cc
+15
-3
lite/kernels/xpu/subgraph_compute.cc
lite/kernels/xpu/subgraph_compute.cc
+2
-1
lite/kernels/xpu/subgraph_compute.h
lite/kernels/xpu/subgraph_compute.h
+3
-2
lite/operators/CMakeLists.txt
lite/operators/CMakeLists.txt
+1
-2
lite/operators/attention_padding_mask_op.cc
lite/operators/attention_padding_mask_op.cc
+3
-1
lite/operators/instance_norm_op.cc
lite/operators/instance_norm_op.cc
+3
-2
lite/operators/reduce_prod_op.cc
lite/operators/reduce_prod_op.cc
+1
-1
lite/tests/kernels/CMakeLists.txt
lite/tests/kernels/CMakeLists.txt
+1
-0
lite/tests/kernels/mul_compute_test.cc
lite/tests/kernels/mul_compute_test.cc
+145
-0
lite/tests/kernels/unsqueeze_compute_test.cc
lite/tests/kernels/unsqueeze_compute_test.cc
+2
-0
lite/tools/ci_build.sh
lite/tools/ci_build.sh
+0
-17
lite/utils/env.h
lite/utils/env.h
+71
-0
mobile/src/framework/cl/cl_image.cpp
mobile/src/framework/cl/cl_image.cpp
+31
-0
mobile/src/framework/cl/cl_image.h
mobile/src/framework/cl/cl_image.h
+2
-0
mobile/src/framework/cl/cl_tool.h
mobile/src/framework/cl/cl_tool.h
+8
-7
mobile/src/framework/executor.cpp
mobile/src/framework/executor.cpp
+4
-1
mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
+15
-2
mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
+2066
-1808
mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
...c/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
+87
-19
mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
+11
-0
mobile/src/operators/kernel/cl/conv_add_kernel.cpp
mobile/src/operators/kernel/cl/conv_add_kernel.cpp
+9
-0
mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp
mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp
+9
-0
mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
+9
-0
mobile/src/operators/kernel/cl/conv_kernel.cpp
mobile/src/operators/kernel/cl/conv_kernel.cpp
+9
-0
mobile/src/operators/kernel/cl/conv_relu_kernel.cpp
mobile/src/operators/kernel/cl/conv_relu_kernel.cpp
+9
-0
mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
+49
-8
mobile/src/operators/op_param.h
mobile/src/operators/op_param.h
+1
-0
mobile/test/net/test_net_multi_feed.cpp
mobile/test/net/test_net_multi_feed.cpp
+2
-0
未找到文件。
cmake/cross_compiling/ios.cmake
浏览文件 @
6554854a
...
...
@@ -120,6 +120,7 @@
#
## Lite settings
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-flto"
)
if
(
ARM_TARGET_OS STREQUAL
"ios"
)
set
(
PLATFORM
"OS"
)
elseif
(
ARM_TARGET_OS STREQUAL
"ios64"
)
...
...
lite/api/CMakeLists.txt
浏览文件 @
6554854a
...
...
@@ -305,6 +305,26 @@ if(NOT IOS)
FPGA_DEPS
${
fpga_kernels
}
X86_DEPS
${
x86_kernels
}
CUDA_DEPS
${
cuda_kernels
}
)
lite_cc_binary
(
benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${
ops
}
${
host_kernels
}
ARM_DEPS
${
arm_kernels
}
NPU_DEPS
${
npu_kernels
}
XPU_DEPS
${
xpu_kernels
}
CL_DEPS
${
opencl_kernels
}
FPGA_DEPS
${
fpga_kernels
}
X86_DEPS
${
x86_kernels
}
CUDA_DEPS
${
cuda_kernels
}
)
lite_cc_binary
(
multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${
ops
}
${
host_kernels
}
ARM_DEPS
${
arm_kernels
}
CV_DEPS paddle_cv_arm
NPU_DEPS
${
npu_kernels
}
XPU_DEPS
${
xpu_kernels
}
CL_DEPS
${
opencl_kernels
}
FPGA_DEPS
${
fpga_kernels
}
X86_DEPS
${
x86_kernels
}
CUDA_DEPS
${
cuda_kernels
}
)
endif
()
#lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
...
...
lite/api/lite_multithread_test.cc
0 → 100644
浏览文件 @
6554854a
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <string>
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/device_info.h"
#include "lite/core/profile/timer.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/basic_profiler.h"
#endif // LITE_WITH_PROFILE
#include <thread> // NOLINT
// Bring the profiling timer into scope; the run helpers below use it to time
// predictor->Run() calls.
using paddle::lite::profile::Timer;

// Command-line flags specific to this test (FLAGS_model_dir, FLAGS_threads and
// FLAGS_repeats, which main() also reads, are not defined in this file).

// Input shape(s) used for the model given by --model_dir. main() splits the
// value on ':' into one entry per input and each entry on ',' into dimensions.
DEFINE_string(input_shape,
              "1,3,224,224",
              "input shapes, separated by colon and comma");
// Directory of the second model, used by the two-model test type.
DEFINE_string(model_dir_0, "", "model_dir_0");
// Input shape(s) for the second model, parsed the same way as --input_shape.
DEFINE_string(input_shape_0,
              "1,3,224,224",
              "input shapes another, separated by colon and comma");
// When true, main() uses the model directories as given and does not call
// OutputOptModel(); when false it first writes "<dir>opt2" optimized models.
DEFINE_bool(use_optimize_nb,
            false,
            "optimized & naive buffer model for mobile devices");
// Selects the scenario in main(): 0 runs RunTestType_00 then RunTestType_10,
// 1 runs RunTestType_01 then RunTestType_11.
DEFINE_int32(test_type, 0, "multithread test type");
namespace
paddle
{
namespace
lite_api
{
// Loads the model in `load_model_dir` with a CxxConfig restricted to
// ARM/float kernels and saves it as a naive-buffer optimized model into
// `save_optimized_model_dir`, removing any previous content of that directory
// first.
//
// `input_shapes` is accepted for interface compatibility but is not used.
void OutputOptModel(const std::string& load_model_dir,
                    const std::string& save_optimized_model_dir,
                    const std::vector<std::vector<int64_t>>& input_shapes) {
  lite_api::CxxConfig config;
  config.set_model_dir(load_model_dir);
  config.set_valid_places({
      Place{TARGET(kARM), PRECISION(kFloat)},
  });
  auto predictor = lite_api::CreatePaddlePredictor(config);

  // Delete the old optimized model. The directory name comes from the command
  // line, so it is wrapped in single quotes (with embedded single quotes
  // escaped as '\'') before being handed to the shell; otherwise a path with
  // spaces or shell metacharacters would remove the wrong files or run
  // arbitrary commands. An empty path is skipped: there is nothing to delete.
  if (!save_optimized_model_dir.empty()) {
    std::string quoted_dir = "'";
    for (const char c : save_optimized_model_dir) {
      if (c == '\'') {
        quoted_dir += "'\\''";
      } else {
        quoted_dir += c;
      }
    }
    quoted_dir += "'";
    const std::string rm_cmd = "rm -rf " + quoted_dir;
    const int ret = system(rm_cmd.c_str());
    if (ret == 0) {
      LOG(INFO) << "delete old optimized model " << save_optimized_model_dir;
    }
  }

  predictor->SaveOptimizedModel(save_optimized_model_dir,
                                LiteModelType::kNaiveBuffer);
  LOG(INFO) << "Load model from " << load_model_dir;
  LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
// Creates a fresh MobileConfig predictor for `model_dir` and benchmarks it on
// the calling thread.
//
// Every input tensor is resized to the matching entry of `input_shapes` and
// filled with 1.0f. The predictor is then run `warmup_times` times untimed and
// `repeat` times timed; after each timed run the first two values of output 0
// are logged, and at the end the average/min/max lap times are logged.
//
// `tid` is only used to tag log lines. `power_mode` and `thread_num` are
// forwarded to the MobileConfig.
// NOTE(review): out[1] is read unconditionally, so output 0 is assumed to hold
// at least two floats - confirm for the models this test is used with.
void Run(const std::vector<std::vector<int64_t>>& input_shapes,
         const std::string& model_dir,
         const PowerMode power_mode,
         const int thread_num,
         const int repeat,
         int tid,
         const int warmup_times = 5) {
  lite_api::MobileConfig config;
  config.set_model_dir(model_dir);
  config.set_power_mode(power_mode);
  config.set_threads(thread_num);

  auto predictor = lite_api::CreatePaddlePredictor(config);

  for (size_t j = 0; j < input_shapes.size(); ++j) {
    auto input_tensor = predictor->GetInput(static_cast<int>(j));
    input_tensor->Resize(input_shapes[j]);
    auto input_data = input_tensor->mutable_data<float>();
    // Dimensions are int64_t, so the element count is kept in 64 bits too;
    // accumulating it in an int would silently narrow large shapes.
    int64_t input_num = 1;
    for (const int64_t dim : input_shapes[j]) {
      input_num *= dim;
    }
    for (int64_t i = 0; i < input_num; ++i) {
      input_data[i] = 1.f;
    }
  }

  // Untimed warm-up iterations.
  for (int i = 0; i < warmup_times; ++i) {
    predictor->Run();
  }

  Timer ti;
  for (int j = 0; j < repeat; ++j) {
    ti.Start();
    predictor->Run();
    ti.Stop();  // the lap is read back through ti.LapTimes() below
    auto output = predictor->GetOutput(0);
    auto out = output->data<float>();
    LOG(INFO) << "[thread " << tid << "] Model: " << model_dir
              << " output[0]:" << out[0] << "; output[1]:" << out[1];
  }
  LOG(INFO) << "[thread " << tid << "] Model: " << model_dir
            << ", power_mode: " << static_cast<int>(power_mode)
            << ", threads num " << thread_num
            << ", avg time: " << ti.LapTimes().Avg() << "ms"
            << ", min time: " << ti.LapTimes().Min() << " ms"
            << ", max time: " << ti.LapTimes().Max() << " ms.";
}
// Test type 0, part 1: benchmarks the same model concurrently on two threads.
// A worker thread calls Run() with thread id 0 while the calling thread calls
// Run() with thread id 1; each Run() builds its own predictor. The worker is
// joined before returning.
void RunTestType_00(const std::vector<std::vector<int64_t>>& input_shapes,
                    const std::string& model_dir,
                    const PowerMode power_mode,
                    const int thread_num,
                    const int repeat,
                    const int warmup_times = 5) {
  // The captured references stay valid because the worker is joined below.
  std::thread worker([&]() {
    Run(input_shapes,
        model_dir,
        power_mode,
        thread_num,
        repeat,
        /*tid=*/0,
        warmup_times);
  });

  Run(input_shapes,
      model_dir,
      power_mode,
      thread_num,
      repeat,
      /*tid=*/1,
      warmup_times);

  worker.join();
}
// Test type 1, part 1: benchmarks two different models concurrently.
// A worker thread runs the first model (`model_dir`, thread id 0) while the
// calling thread runs the second model (`model_dir_0`, thread id 1); each
// Run() builds its own predictor. The worker is joined before returning.
void RunTestType_01(const std::vector<std::vector<int64_t>>& input_shapes,
                    const std::string& model_dir,
                    const std::vector<std::vector<int64_t>>& input_shapes_0,
                    const std::string& model_dir_0,
                    const PowerMode power_mode,
                    const int thread_num,
                    const int repeat,
                    const int warmup_times = 5) {
  // The captured references stay valid because the worker is joined below.
  std::thread worker([&]() {
    Run(input_shapes,
        model_dir,
        power_mode,
        thread_num,
        repeat,
        /*tid=*/0,
        warmup_times);
  });

  Run(input_shapes_0,
      model_dir_0,
      power_mode,
      thread_num,
      repeat,
      /*tid=*/1,
      warmup_times);

  worker.join();
}
// Runs an already-created predictor once and logs the result.
//
// Every input tensor is resized to the matching entry of `input_shapes` and
// filled with 1.0f, then a single timed predictor->Run() is executed and the
// lap time plus the first two values of output 0 are logged.
//
// `index` and `name` are only used to tag the log line.
// NOTE(review): out[1] is read unconditionally, so output 0 is assumed to hold
// at least two floats - confirm for the models this test is used with.
void run_with_predictor(std::shared_ptr<PaddlePredictor> predictor,
                        const std::vector<std::vector<int64_t>>& input_shapes,
                        int index,
                        const std::string& name) {
  for (size_t j = 0; j < input_shapes.size(); ++j) {
    auto input_tensor = predictor->GetInput(static_cast<int>(j));
    input_tensor->Resize(input_shapes[j]);
    auto input_data = input_tensor->mutable_data<float>();
    // Dimensions are int64_t, so the element count is kept in 64 bits too;
    // accumulating it in an int would silently narrow large shapes.
    int64_t input_num = 1;
    for (const int64_t dim : input_shapes[j]) {
      input_num *= dim;
    }
    for (int64_t i = 0; i < input_num; ++i) {
      input_data[i] = 1.f;
    }
  }

  Timer ti;
  ti.Start();
  predictor->Run();
  ti.Stop();  // the single lap is read back through ti.LapTimes() below

  auto output = predictor->GetOutput(0);
  auto out = output->data<float>();
  LOG(INFO) << "[thread " << index << "] name: " << name
            << ",run time: " << ti.LapTimes().Avg() << "ms"
            << " output[0]:" << out[0] << "; output[1]:" << out[1];
}
void
RunTestType_10
(
const
std
::
vector
<
std
::
vector
<
int64_t
>>&
input_shapes
,
const
std
::
string
&
model_dir
,
const
PowerMode
power_mode
,
const
int
thread_num
,
const
int
repeat
,
int
warmup
=
5
)
{
lite_api
::
MobileConfig
config
;
config
.
set_model_dir
(
model_dir
);
config
.
set_power_mode
(
power_mode
);
config
.
set_threads
(
thread_num
);
auto
predictor
=
lite_api
::
CreatePaddlePredictor
(
config
);
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
std
::
thread
pre_th0
(
run_with_predictor
,
predictor
,
input_shapes
,
i
,
model_dir
);
pre_th0
.
join
();
}
}
void
RunTestType_11
(
const
std
::
vector
<
std
::
vector
<
int64_t
>>&
input_shapes
,
const
std
::
string
&
model_dir
,
const
std
::
vector
<
std
::
vector
<
int64_t
>>&
input_shapes_0
,
const
std
::
string
&
model_dir_0
,
const
PowerMode
power_mode
,
const
int
thread_num
,
const
int
repeat
,
int
warmup
=
5
)
{
lite_api
::
MobileConfig
config
;
config
.
set_model_dir
(
model_dir
);
config
.
set_power_mode
(
power_mode
);
config
.
set_threads
(
thread_num
);
auto
predictor
=
lite_api
::
CreatePaddlePredictor
(
config
);
config
.
set_model_dir
(
model_dir_0
);
auto
predictor_0
=
lite_api
::
CreatePaddlePredictor
(
config
);
for
(
int
i
=
0
;
i
<
2
*
repeat
;
i
+=
2
)
{
std
::
thread
pre_th0
(
run_with_predictor
,
predictor
,
input_shapes
,
i
,
model_dir
);
std
::
thread
pre_th1
(
run_with_predictor
,
predictor_0
,
input_shapes_0
,
i
+
1
,
model_dir_0
);
pre_th0
.
join
();
pre_th1
.
join
();
}
}
#endif
}
// namespace lite_api
}
// namespace paddle
int
main
(
int
argc
,
char
**
argv
)
{
gflags
::
ParseCommandLineFlags
(
&
argc
,
&
argv
,
true
);
if
(
FLAGS_model_dir
==
""
)
{
LOG
(
INFO
)
<<
"usage: "
<<
"--model_dir /path/to/your/model"
;
exit
(
0
);
}
std
::
string
save_optimized_model_dir
=
""
;
std
::
string
save_optimized_model_dir_0
=
""
;
if
(
FLAGS_use_optimize_nb
)
{
save_optimized_model_dir
=
FLAGS_model_dir
;
save_optimized_model_dir_0
=
FLAGS_model_dir_0
;
}
else
{
save_optimized_model_dir
=
FLAGS_model_dir
+
"opt2"
;
save_optimized_model_dir_0
=
FLAGS_model_dir_0
+
"opt2"
;
}
auto
split_string
=
[](
const
std
::
string
&
str_in
)
->
std
::
vector
<
std
::
string
>
{
std
::
vector
<
std
::
string
>
str_out
;
std
::
string
tmp_str
=
str_in
;
while
(
!
tmp_str
.
empty
())
{
size_t
next_offset
=
tmp_str
.
find
(
":"
);
str_out
.
push_back
(
tmp_str
.
substr
(
0
,
next_offset
));
if
(
next_offset
==
std
::
string
::
npos
)
{
break
;
}
else
{
tmp_str
=
tmp_str
.
substr
(
next_offset
+
1
);
}
}
return
str_out
;
};
auto
get_shape
=
[](
const
std
::
string
&
str_shape
)
->
std
::
vector
<
int64_t
>
{
std
::
vector
<
int64_t
>
shape
;
std
::
string
tmp_str
=
str_shape
;
while
(
!
tmp_str
.
empty
())
{
int
dim
=
atoi
(
tmp_str
.
data
());
shape
.
push_back
(
dim
);
size_t
next_offset
=
tmp_str
.
find
(
","
);
if
(
next_offset
==
std
::
string
::
npos
)
{
break
;
}
else
{
tmp_str
=
tmp_str
.
substr
(
next_offset
+
1
);
}
}
return
shape
;
};
std
::
vector
<
std
::
string
>
str_input_shapes
=
split_string
(
FLAGS_input_shape
);
std
::
vector
<
std
::
vector
<
int64_t
>>
input_shapes
;
for
(
int
i
=
0
;
i
<
str_input_shapes
.
size
();
++
i
)
{
input_shapes
.
push_back
(
get_shape
(
str_input_shapes
[
i
]));
}
std
::
vector
<
std
::
string
>
str_input_shapes_0
=
split_string
(
FLAGS_input_shape_0
);
std
::
vector
<
std
::
vector
<
int64_t
>>
input_shapes_0
;
for
(
int
i
=
0
;
i
<
str_input_shapes_0
.
size
();
++
i
)
{
input_shapes_0
.
push_back
(
get_shape
(
str_input_shapes_0
[
i
]));
}
if
(
!
FLAGS_use_optimize_nb
)
{
// Output optimized model
paddle
::
lite_api
::
OutputOptModel
(
FLAGS_model_dir
,
save_optimized_model_dir
,
input_shapes
);
paddle
::
lite_api
::
OutputOptModel
(
FLAGS_model_dir_0
,
save_optimized_model_dir_0
,
input_shapes_0
);
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
// Run inference using optimized model
if
(
FLAGS_test_type
==
0
)
{
paddle
::
lite_api
::
RunTestType_00
(
input_shapes
,
save_optimized_model_dir
,
static_cast
<
paddle
::
lite_api
::
PowerMode
>
(
0
),
FLAGS_threads
,
FLAGS_repeats
,
5
);
LOG
(
INFO
)
<<
"=========above is case 0, below is case "
"1============================"
;
paddle
::
lite_api
::
RunTestType_10
(
input_shapes
,
save_optimized_model_dir
,
static_cast
<
paddle
::
lite_api
::
PowerMode
>
(
0
),
FLAGS_threads
,
FLAGS_repeats
);
}
if
(
FLAGS_test_type
==
1
)
{
paddle
::
lite_api
::
RunTestType_01
(
input_shapes
,
save_optimized_model_dir
,
input_shapes_0
,
save_optimized_model_dir_0
,
static_cast
<
paddle
::
lite_api
::
PowerMode
>
(
0
),
FLAGS_threads
,
FLAGS_repeats
,
5
);
LOG
(
INFO
)
<<
"=========above is case 0, below is case "
"1============================"
;
paddle
::
lite_api
::
RunTestType_11
(
input_shapes
,
save_optimized_model_dir
,
input_shapes_0
,
save_optimized_model_dir_0
,
static_cast
<
paddle
::
lite_api
::
PowerMode
>
(
0
),
FLAGS_threads
,
FLAGS_repeats
);
}
#endif
return
0
;
}
lite/backends/x86/cpu_info.cc
浏览文件 @
6554854a
...
...
@@ -32,26 +32,37 @@
#include <gflags/gflags.h>
#include <algorithm>
DEFINE_double
(
fraction_of_cpu_memory_to_use
,
1
,
"Default use 100% of CPU memory for PaddlePaddle,"
"reserve the rest for page tables, etc"
);
DEFINE_uint64
(
initial_cpu_memory_in_mb
,
500ul
,
"Initial CPU memory for PaddlePaddle, in MD unit."
);
DEFINE_double
(
fraction_of_cuda_pinned_memory_to_use
,
0.5
,
"Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
"reserve the rest for page tables, etc"
);
#include "lite/utils/env.h"
// DEFINE_double(fraction_of_cpu_memory_to_use,
// 1,
// "Default use 100% of CPU memory for PaddlePaddle,"
// "reserve the rest for page tables, etc");
double
fraction_of_cpu_memory_to_use
=
paddle
::
lite
::
GetDoubleFromEnv
(
"fraction_of_cpu_memory_to_use"
,
1
);
// DEFINE_uint64(initial_cpu_memory_in_mb,
// 500ul,
// "Initial CPU memory for PaddlePaddle, in MB unit.");
uint64_t
initial_cpu_memory_in_mb
=
paddle
::
lite
::
GetUInt64FromEnv
(
"initial_cpu_memory_in_mb"
,
500ul
);
// DEFINE_double(
// fraction_of_cuda_pinned_memory_to_use,
// 0.5,
// "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle,"
// "reserve the rest for page tables, etc");
double
fraction_of_cuda_pinned_memory_to_use
=
paddle
::
lite
::
GetDoubleFromEnv
(
"fraction_of_cuda_pinned_memory_to_use"
,
0.5
);
// If use_pinned_memory is true, CPUAllocator calls mlock, which
// returns pinned and locked memory as staging areas for data exchange
// between host and device. Allocates too much would reduce the amount
// of memory available to the system for paging. So, by default, we
// should set false to use_pinned_memory.
DEFINE_bool
(
use_pinned_memory
,
true
,
"If set, allocate cpu pinned memory."
);
// DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
bool
use_pinned_memory
=
paddle
::
lite
::
GetBoolFromEnv
(
"use_pinned_memory"
,
true
);
namespace
paddle
{
namespace
lite
{
...
...
@@ -81,7 +92,7 @@ size_t CpuTotalPhysicalMemory() {
size_t
CpuMaxAllocSize
()
{
// For distributed systems, it requires configuring and limiting
// the fraction of memory to use.
return
FLAGS_
fraction_of_cpu_memory_to_use
*
CpuTotalPhysicalMemory
();
return
fraction_of_cpu_memory_to_use
*
CpuTotalPhysicalMemory
();
}
size_t
CpuMinChunkSize
()
{
...
...
@@ -92,15 +103,14 @@ size_t CpuMinChunkSize() {
size_t
CpuMaxChunkSize
()
{
// Allow to allocate the maximum chunk size is roughly 3% of CPU memory,
// or the initial_cpu_memory_in_mb.
return
std
::
min
(
static_cast
<
size_t
>
(
CpuMaxAllocSize
()
/
32
),
static_cast
<
size_t
>
(
FLAGS_initial_cpu_memory_in_mb
*
1
<<
20
));
return
std
::
min
(
static_cast
<
size_t
>
(
CpuMaxAllocSize
()
/
32
),
static_cast
<
size_t
>
(
initial_cpu_memory_in_mb
*
1
<<
20
));
}
size_t
CUDAPinnedMaxAllocSize
()
{
// For distributed systems, it requires configuring and limiting
// the fraction of memory to use.
return
FLAGS_
fraction_of_cuda_pinned_memory_to_use
*
CpuTotalPhysicalMemory
();
return
fraction_of_cuda_pinned_memory_to_use
*
CpuTotalPhysicalMemory
();
}
size_t
CUDAPinnedMinChunkSize
()
{
...
...
lite/backends/x86/dynamic_loader.cc
浏览文件 @
6554854a
...
...
@@ -22,36 +22,46 @@ limitations under the License. */
#include "lite/backends/x86/cupti_lib_path.h"
#include "lite/backends/x86/port.h"
#include "lite/backends/x86/warpctc_lib_path.h"
#include "lite/utils/env.h"
#include "lite/utils/paddle_enforce.h"
DEFINE_string
(
cudnn_dir
,
""
,
"Specify path for loading libcudnn.so. For instance, "
"/usr/local/cudnn/lib. If empty [default], dlopen "
"will search cudnn from LD_LIBRARY_PATH"
);
// DEFINE_string(cudnn_dir,
// "",
// "Specify path for loading libcudnn.so. For instance, "
// "/usr/local/cudnn/lib. If empty [default], dlopen "
// "will search cudnn from LD_LIBRARY_PATH");
std
::
string
cudnn_dir
=
paddle
::
lite
::
GetStringFromEnv
(
"cudnn_dir"
);
// NOLINT
DEFINE_string
(
cuda_dir
,
""
,
"Specify path for loading cuda library, such as libcublas, "
"libcurand. For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH"
);
// DEFINE_string(cuda_dir,
// "",
// "Specify path for loading cuda library, such as libcublas, "
// "libcurand. For instance, /usr/local/cuda/lib64. If default, "
// "dlopen will search cuda from LD_LIBRARY_PATH");
std
::
string
cuda_dir
=
paddle
::
lite
::
GetStringFromEnv
(
"cuda_dir"
);
// NOLINT
DEFINE_string
(
warpctc_dir
,
""
,
"Specify path for loading libwarpctc.so."
);
// DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
std
::
string
f_warpctc_dir
=
// NOLINT
paddle
::
lite
::
GetStringFromEnv
(
"warpctc_dir"
);
// NOLINT
DEFINE_string
(
nccl_dir
,
""
,
"Specify path for loading nccl library, such as libcublas, "
"libcurand. For instance, /usr/local/cuda/lib64. If default, "
"dlopen will search cuda from LD_LIBRARY_PATH"
);
// DEFINE_string(nccl_dir,
// "",
// "Specify path for loading nccl library, such as libcublas, "
// "libcurand. For instance, /usr/local/cuda/lib64. If default, "
// "dlopen will search cuda from LD_LIBRARY_PATH");
std
::
string
nccl_dir
=
paddle
::
lite
::
GetStringFromEnv
(
"nccl_dir"
);
// NOLINT
DEFINE_string
(
cupti_dir
,
""
,
"Specify path for loading cupti.so."
);
// DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
std
::
string
cupti_dir
=
paddle
::
lite
::
GetStringFromEnv
(
"cupti_dir"
);
// NOLINT
DEFINE_string
(
tensorrt_dir
,
""
,
"Specify path for loading tensorrt library, such as libnvinfer.so."
);
// DEFINE_string(
// tensorrt_dir,
// "",
// "Specify path for loading tensorrt library, such as libnvinfer.so.");
std
::
string
tensorrt_dir
=
// NOLINT
paddle
::
lite
::
GetStringFromEnv
(
"tensorrt_dir"
);
// NOLINT
DEFINE_string
(
mklml_dir
,
""
,
"Specify path for loading libmklml_intel.so."
);
// DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so.");
std
::
string
mklml_dir
=
paddle
::
lite
::
GetStringFromEnv
(
"mklml_dir"
);
// NOLINT
namespace
paddle
{
namespace
lite
{
...
...
@@ -180,28 +190,28 @@ auto error_msg =
void
*
GetCublasDsoHandle
()
{
#if defined(__APPLE__) || defined(__OSX__)
return
GetDsoHandleFromSearchPath
(
FLAGS_
cuda_dir
,
"libcublas.dylib"
);
return
GetDsoHandleFromSearchPath
(
cuda_dir
,
"libcublas.dylib"
);
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return
GetDsoHandleFromSearchPath
(
FLAGS_
cuda_dir
,
win_cublas_lib
);
return
GetDsoHandleFromSearchPath
(
cuda_dir
,
win_cublas_lib
);
#else
return
GetDsoHandleFromSearchPath
(
FLAGS_
cuda_dir
,
"libcublas.so"
);
return
GetDsoHandleFromSearchPath
(
cuda_dir
,
"libcublas.so"
);
#endif
}
void
*
GetCUDNNDsoHandle
()
{
#if defined(__APPLE__) || defined(__OSX__)
return
GetDsoHandleFromSearchPath
(
FLAGS_
cudnn_dir
,
"libcudnn.dylib"
,
false
);
return
GetDsoHandleFromSearchPath
(
cudnn_dir
,
"libcudnn.dylib"
,
false
);
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return
GetDsoHandleFromSearchPath
(
FLAGS_
cudnn_dir
,
win_cudnn_lib
);
return
GetDsoHandleFromSearchPath
(
cudnn_dir
,
win_cudnn_lib
);
#else
return
GetDsoHandleFromSearchPath
(
FLAGS_
cudnn_dir
,
"libcudnn.so"
,
false
);
return
GetDsoHandleFromSearchPath
(
cudnn_dir
,
"libcudnn.so"
,
false
);
#endif
}
void
*
GetCUPTIDsoHandle
()
{
std
::
string
cupti_path
=
cupti_lib_path
;
if
(
!
FLAGS_
cupti_dir
.
empty
())
{
cupti_path
=
FLAGS_
cupti_dir
;
if
(
!
cupti_dir
.
empty
())
{
cupti_path
=
cupti_dir
;
}
#if defined(__APPLE__) || defined(__OSX__)
return
GetDsoHandleFromSearchPath
(
cupti_path
,
"libcupti.dylib"
,
false
);
...
...
@@ -212,18 +222,18 @@ void* GetCUPTIDsoHandle() {
void
*
GetCurandDsoHandle
()
{
#if defined(__APPLE__) || defined(__OSX__)
return
GetDsoHandleFromSearchPath
(
FLAGS_
cuda_dir
,
"libcurand.dylib"
);
return
GetDsoHandleFromSearchPath
(
cuda_dir
,
"libcurand.dylib"
);
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return
GetDsoHandleFromSearchPath
(
FLAGS_
cuda_dir
,
win_curand_lib
);
return
GetDsoHandleFromSearchPath
(
cuda_dir
,
win_curand_lib
);
#else
return
GetDsoHandleFromSearchPath
(
FLAGS_
cuda_dir
,
"libcurand.so"
);
return
GetDsoHandleFromSearchPath
(
cuda_dir
,
"libcurand.so"
);
#endif
}
void
*
GetWarpCTCDsoHandle
()
{
std
::
string
warpctc_dir
=
warpctc_lib_path
;
if
(
!
FLAGS
_warpctc_dir
.
empty
())
{
warpctc_dir
=
FLAGS
_warpctc_dir
;
if
(
!
f
_warpctc_dir
.
empty
())
{
warpctc_dir
=
f
_warpctc_dir
;
}
#if defined(__APPLE__) || defined(__OSX__)
return
GetDsoHandleFromSearchPath
(
warpctc_dir
,
"libwarpctc.dylib"
);
...
...
@@ -236,27 +246,27 @@ void* GetWarpCTCDsoHandle() {
void
*
GetNCCLDsoHandle
()
{
#if defined(__APPLE__) || defined(__OSX__)
return
GetDsoHandleFromSearchPath
(
FLAGS_
nccl_dir
,
"libnccl.dylib"
);
return
GetDsoHandleFromSearchPath
(
nccl_dir
,
"libnccl.dylib"
);
#else
return
GetDsoHandleFromSearchPath
(
FLAGS_
nccl_dir
,
"libnccl.so"
);
return
GetDsoHandleFromSearchPath
(
nccl_dir
,
"libnccl.so"
);
#endif
}
void
*
GetTensorRtDsoHandle
()
{
#if defined(__APPLE__) || defined(__OSX__)
return
GetDsoHandleFromSearchPath
(
FLAGS_
tensorrt_dir
,
"libnvinfer.dylib"
);
return
GetDsoHandleFromSearchPath
(
tensorrt_dir
,
"libnvinfer.dylib"
);
#else
return
GetDsoHandleFromSearchPath
(
FLAGS_
tensorrt_dir
,
"libnvinfer.so"
);
return
GetDsoHandleFromSearchPath
(
tensorrt_dir
,
"libnvinfer.so"
);
#endif
}
void
*
GetMKLMLDsoHandle
()
{
#if defined(__APPLE__) || defined(__OSX__)
return
GetDsoHandleFromSearchPath
(
FLAGS_
mklml_dir
,
"libmklml_intel.dylib"
);
return
GetDsoHandleFromSearchPath
(
mklml_dir
,
"libmklml_intel.dylib"
);
#elif defined(_WIN32)
return
GetDsoHandleFromSearchPath
(
FLAGS_
mklml_dir
,
"mklml.dll"
);
return
GetDsoHandleFromSearchPath
(
mklml_dir
,
"mklml.dll"
);
#else
return
GetDsoHandleFromSearchPath
(
FLAGS_
mklml_dir
,
"libmklml_intel.so"
);
return
GetDsoHandleFromSearchPath
(
mklml_dir
,
"libmklml_intel.so"
);
#endif
}
...
...
lite/backends/x86/jit/gen_base.cc
浏览文件 @
6554854a
...
...
@@ -21,13 +21,15 @@
// posix_memalign
#include "lite/backends/x86/cpu_info.h"
#include "lite/backends/x86/jit/macro.h"
#include "lite/utils/env.h"
#include "lite/utils/paddle_enforce.h"
#ifndef _WIN32
#define posix_memalign_free free
#endif
DEFINE_bool
(
dump_jitcode
,
false
,
"Whether to dump the jitcode to file"
);
// DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
bool
dump_jitcode
=
paddle
::
lite
::
GetBoolFromEnv
(
"dump_jitcode"
);
namespace
paddle
{
namespace
lite
{
...
...
lite/backends/x86/jit/gen_base.h
浏览文件 @
6554854a
...
...
@@ -20,7 +20,8 @@
#include <vector>
#include "lite/backends/x86/jit/kernel_base.h"
DECLARE_bool
(
dump_jitcode
);
// DECLARE_bool(dump_jitcode);
extern
bool
dump_jitcode
;
namespace
paddle
{
namespace
lite
{
...
...
@@ -36,7 +37,7 @@ class GenBase : public Kernel {
template
<
typename
Func
>
Func
getCode
()
const
{
const
unsigned
char
*
code
=
this
->
getCodeInternal
();
if
(
FLAGS_
dump_jitcode
)
{
if
(
dump_jitcode
)
{
this
->
dumpCode
(
code
);
}
// Note: failed to cast with reinterpret_cast<const Func> on Mac clang,
...
...
lite/backends/x86/math/beam_search.cc
浏览文件 @
6554854a
...
...
@@ -86,7 +86,8 @@ class BeamSearchFunctor<TARGET(kX86), T> {
// selected_ids->mutable_data<int64_t>(dims, platform::CPUPlace());
// auto *selected_scores_data =
// selected_scores->mutable_data<float>(dims, platform::CPUPlace());
parent_idx
->
Resize
({
static_cast
<
int64_t
>
(
num_instances
)});
parent_idx
->
Resize
(
std
::
vector
<
int64_t
>
({
static_cast
<
int64_t
>
(
num_instances
)}));
auto
*
parent_idx_data
=
parent_idx
?
parent_idx
->
mutable_data
<
int
>
(
TARGET
(
kX86
))
:
nullptr
;
// auto *parent_idx_data =
...
...
lite/backends/x86/math/detail/avx_mathfun.h
浏览文件 @
6554854a
...
...
@@ -41,9 +41,11 @@
(this is the zlib license)
*/
#pragma once
#include "lite/backends/x86/cpu_info.h"
namespace
paddle
{
namespace
lite
{
/* __m128 is ugly to write */
typedef
__m256
v8sf
;
// vector of 8 float (avx)
typedef
__m256i
v8si
;
// vector of 8 int (avx)
...
...
@@ -134,7 +136,7 @@ typedef union imm_xmm_union {
return (ret); \
}
//#warning "Using SSE2 to perform AVX2 bitshift ops"
//
#warning "Using SSE2 to perform AVX2 bitshift ops"
AVX2_BITOP_USING_SSE2
(
slli_epi32
)
AVX2_BITOP_USING_SSE2
(
srli_epi32
)
...
...
@@ -152,7 +154,7 @@ AVX2_BITOP_USING_SSE2(srli_epi32)
return (ret); \
}
//#warning "Using SSE2 to perform AVX2 integer ops"
//
#warning "Using SSE2 to perform AVX2 integer ops"
AVX2_INTOP_USING_SSE2
(
and_si128
)
AVX2_INTOP_USING_SSE2
(
andnot_si128
)
AVX2_INTOP_USING_SSE2
(
cmpeq_epi32
)
...
...
@@ -175,23 +177,23 @@ AVX2_INTOP_USING_SSE2(add_epi32)
*/
v8sf
log256_ps
(
v8sf
x
)
{
v8si
imm0
;
v8sf
one
=
*
(
v8sf
*
)
_ps256_1
;
v8sf
one
=
*
(
v8sf
*
)
_ps256_1
;
// NOLINT
// v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
v8sf
invalid_mask
=
_mm256_cmp_ps
(
x
,
_mm256_setzero_ps
(),
_CMP_LE_OS
);
x
=
_mm256_max_ps
(
x
,
*
(
v8sf
*
)
_ps256_min_norm_pos
);
/* cut off denormalized stuff */
x
=
_mm256_max_ps
(
x
,
*
(
v8sf
*
)
_ps256_min_norm_pos
);
// NOLINT
/* cut off denormalized stuff */
// NOLINT
// can be done with AVX2
imm0
=
avx2_mm256_srli_epi32
(
_mm256_castps_si256
(
x
),
23
);
/* keep only the fractional part */
x
=
_mm256_and_ps
(
x
,
*
(
v8sf
*
)
_ps256_inv_mant_mask
);
x
=
_mm256_or_ps
(
x
,
*
(
v8sf
*
)
_ps256_0p5
);
x
=
_mm256_and_ps
(
x
,
*
(
v8sf
*
)
_ps256_inv_mant_mask
);
// NOLINT
x
=
_mm256_or_ps
(
x
,
*
(
v8sf
*
)
_ps256_0p5
);
// NOLINT
// this is again another AVX2 instruction
imm0
=
avx2_mm256_sub_epi32
(
imm0
,
*
(
v8si
*
)
_pi32_256_0x7f
);
imm0
=
avx2_mm256_sub_epi32
(
imm0
,
*
(
v8si
*
)
_pi32_256_0x7f
);
// NOLINT
v8sf
e
=
_mm256_cvtepi32_ps
(
imm0
);
e
=
_mm256_add_ps
(
e
,
one
);
...
...
@@ -203,7 +205,8 @@ v8sf log256_ps(v8sf x) {
} else { x = x - 1.0; }
*/
// v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
v8sf
mask
=
_mm256_cmp_ps
(
x
,
*
(
v8sf
*
)
_ps256_cephes_SQRTHF
,
_CMP_LT_OS
);
v8sf
mask
=
_mm256_cmp_ps
(
x
,
*
(
v8sf
*
)
_ps256_cephes_SQRTHF
,
_CMP_LT_OS
);
// NOLINT
v8sf
tmp
=
_mm256_and_ps
(
x
,
mask
);
x
=
_mm256_sub_ps
(
x
,
one
);
e
=
_mm256_sub_ps
(
e
,
_mm256_and_ps
(
one
,
mask
));
...
...
@@ -211,34 +214,34 @@ v8sf log256_ps(v8sf x) {
v8sf
z
=
_mm256_mul_ps
(
x
,
x
);
v8sf
y
=
*
(
v8sf
*
)
_ps256_cephes_log_p0
;
v8sf
y
=
*
(
v8sf
*
)
_ps256_cephes_log_p0
;
// NOLINT
y
=
_mm256_mul_ps
(
y
,
x
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_log_p1
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_log_p1
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
x
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_log_p2
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_log_p2
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
x
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_log_p3
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_log_p3
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
x
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_log_p4
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_log_p4
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
x
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_log_p5
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_log_p5
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
x
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_log_p6
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_log_p6
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
x
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_log_p7
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_log_p7
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
x
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_log_p8
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_log_p8
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
x
);
y
=
_mm256_mul_ps
(
y
,
z
);
tmp
=
_mm256_mul_ps
(
e
,
*
(
v8sf
*
)
_ps256_cephes_log_q1
);
tmp
=
_mm256_mul_ps
(
e
,
*
(
v8sf
*
)
_ps256_cephes_log_q1
);
// NOLINT
y
=
_mm256_add_ps
(
y
,
tmp
);
tmp
=
_mm256_mul_ps
(
z
,
*
(
v8sf
*
)
_ps256_0p5
);
tmp
=
_mm256_mul_ps
(
z
,
*
(
v8sf
*
)
_ps256_0p5
);
// NOLINT
y
=
_mm256_sub_ps
(
y
,
tmp
);
tmp
=
_mm256_mul_ps
(
e
,
*
(
v8sf
*
)
_ps256_cephes_log_q2
);
tmp
=
_mm256_mul_ps
(
e
,
*
(
v8sf
*
)
_ps256_cephes_log_q2
);
// NOLINT
x
=
_mm256_add_ps
(
x
,
y
);
x
=
_mm256_add_ps
(
x
,
tmp
);
x
=
_mm256_or_ps
(
x
,
invalid_mask
);
// negative arg will be NAN
...
...
@@ -262,14 +265,14 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
v8sf
exp256_ps
(
v8sf
x
)
{
v8sf
tmp
=
_mm256_setzero_ps
(),
fx
;
v8si
imm0
;
v8sf
one
=
*
(
v8sf
*
)
_ps256_1
;
v8sf
one
=
*
(
v8sf
*
)
_ps256_1
;
// NOLINT
x
=
_mm256_min_ps
(
x
,
*
(
v8sf
*
)
_ps256_exp_hi
);
x
=
_mm256_max_ps
(
x
,
*
(
v8sf
*
)
_ps256_exp_lo
);
x
=
_mm256_min_ps
(
x
,
*
(
v8sf
*
)
_ps256_exp_hi
);
// NOLINT
x
=
_mm256_max_ps
(
x
,
*
(
v8sf
*
)
_ps256_exp_lo
);
// NOLINT
/* express exp(x) as exp(g + n*log(2)) */
fx
=
_mm256_mul_ps
(
x
,
*
(
v8sf
*
)
_ps256_cephes_LOG2EF
);
fx
=
_mm256_add_ps
(
fx
,
*
(
v8sf
*
)
_ps256_0p5
);
fx
=
_mm256_mul_ps
(
x
,
*
(
v8sf
*
)
_ps256_cephes_LOG2EF
);
// NOLINT
fx
=
_mm256_add_ps
(
fx
,
*
(
v8sf
*
)
_ps256_0p5
);
// NOLINT
/* how to perform a floorf with SSE: just below */
// imm0 = _mm256_cvttps_epi32(fx);
...
...
@@ -283,24 +286,24 @@ v8sf exp256_ps(v8sf x) {
mask
=
_mm256_and_ps
(
mask
,
one
);
fx
=
_mm256_sub_ps
(
tmp
,
mask
);
tmp
=
_mm256_mul_ps
(
fx
,
*
(
v8sf
*
)
_ps256_cephes_exp_C1
);
v8sf
z
=
_mm256_mul_ps
(
fx
,
*
(
v8sf
*
)
_ps256_cephes_exp_C2
);
tmp
=
_mm256_mul_ps
(
fx
,
*
(
v8sf
*
)
_ps256_cephes_exp_C1
);
// NOLINT
v8sf
z
=
_mm256_mul_ps
(
fx
,
*
(
v8sf
*
)
_ps256_cephes_exp_C2
);
// NOLINT
x
=
_mm256_sub_ps
(
x
,
tmp
);
x
=
_mm256_sub_ps
(
x
,
z
);
z
=
_mm256_mul_ps
(
x
,
x
);
v8sf
y
=
*
(
v8sf
*
)
_ps256_cephes_exp_p0
;
v8sf
y
=
*
(
v8sf
*
)
_ps256_cephes_exp_p0
;
// NOLINT
y
=
_mm256_mul_ps
(
y
,
x
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_exp_p1
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_exp_p1
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
x
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_exp_p2
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_exp_p2
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
x
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_exp_p3
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_exp_p3
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
x
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_exp_p4
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_exp_p4
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
x
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_exp_p5
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_cephes_exp_p5
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
z
);
y
=
_mm256_add_ps
(
y
,
x
);
y
=
_mm256_add_ps
(
y
,
one
);
...
...
@@ -308,7 +311,7 @@ v8sf exp256_ps(v8sf x) {
/* build 2^n */
imm0
=
_mm256_cvttps_epi32
(
fx
);
// another two AVX2 instructions
imm0
=
avx2_mm256_add_epi32
(
imm0
,
*
(
v8si
*
)
_pi32_256_0x7f
);
imm0
=
avx2_mm256_add_epi32
(
imm0
,
*
(
v8si
*
)
_pi32_256_0x7f
);
// NOLINT
imm0
=
avx2_mm256_slli_epi32
(
imm0
,
23
);
v8sf
pow2n
=
_mm256_castsi256_ps
(
imm0
);
y
=
_mm256_mul_ps
(
y
,
pow2n
);
...
...
@@ -349,12 +352,12 @@ v8sf sin256_ps(v8sf x) { // any x
sign_bit
=
x
;
/* take the absolute value */
x
=
_mm256_and_ps
(
x
,
*
(
v8sf
*
)
_ps256_inv_sign_mask
);
x
=
_mm256_and_ps
(
x
,
*
(
v8sf
*
)
_ps256_inv_sign_mask
);
// NOLINT
/* extract the sign bit (upper one) */
sign_bit
=
_mm256_and_ps
(
sign_bit
,
*
(
v8sf
*
)
_ps256_sign_mask
);
sign_bit
=
_mm256_and_ps
(
sign_bit
,
*
(
v8sf
*
)
_ps256_sign_mask
);
// NOLINT
/* scale by 4/Pi */
y
=
_mm256_mul_ps
(
x
,
*
(
v8sf
*
)
_ps256_cephes_FOPI
);
y
=
_mm256_mul_ps
(
x
,
*
(
v8sf
*
)
_ps256_cephes_FOPI
);
// NOLINT
/*
Here we start a series of integer operations, which are in the
...
...
@@ -367,12 +370,12 @@ v8sf sin256_ps(v8sf x) { // any x
imm2
=
_mm256_cvttps_epi32
(
y
);
/* j=(j+1) & (~1) (see the cephes sources) */
// another two AVX2 instruction
imm2
=
avx2_mm256_add_epi32
(
imm2
,
*
(
v8si
*
)
_pi32_256_1
);
imm2
=
avx2_mm256_and_si256
(
imm2
,
*
(
v8si
*
)
_pi32_256_inv1
);
imm2
=
avx2_mm256_add_epi32
(
imm2
,
*
(
v8si
*
)
_pi32_256_1
);
// NOLINT
imm2
=
avx2_mm256_and_si256
(
imm2
,
*
(
v8si
*
)
_pi32_256_inv1
);
// NOLINT
y
=
_mm256_cvtepi32_ps
(
imm2
);
/* get the swap sign flag */
imm0
=
avx2_mm256_and_si256
(
imm2
,
*
(
v8si
*
)
_pi32_256_4
);
imm0
=
avx2_mm256_and_si256
(
imm2
,
*
(
v8si
*
)
_pi32_256_4
);
// NOLINT
imm0
=
avx2_mm256_slli_epi32
(
imm0
,
29
);
/* get the polynom selection mask
there is one polynom for 0 <= x <= Pi/4
...
...
@@ -380,31 +383,31 @@ v8sf sin256_ps(v8sf x) { // any x
Both branches will be computed.
*/
imm2
=
avx2_mm256_and_si256
(
imm2
,
*
(
v8si
*
)
_pi32_256_2
);
imm2
=
avx2_mm256_cmpeq_epi32
(
imm2
,
*
(
v8si
*
)
_pi32_256_0
);
imm2
=
avx2_mm256_and_si256
(
imm2
,
*
(
v8si
*
)
_pi32_256_2
);
// NOLINT
imm2
=
avx2_mm256_cmpeq_epi32
(
imm2
,
*
(
v8si
*
)
_pi32_256_0
);
// NOLINT
#else
/* we use SSE2 routines to perform the integer ops */
COPY_IMM_TO_XMM
(
_mm256_cvttps_epi32
(
y
),
imm2_1
,
imm2_2
);
imm2_1
=
_mm_add_epi32
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_1
);
imm2_2
=
_mm_add_epi32
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_1
);
imm2_1
=
_mm_add_epi32
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_1
);
// NOLINT
imm2_2
=
_mm_add_epi32
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_1
);
// NOLINT
imm2_1
=
_mm_and_si128
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_inv1
);
imm2_2
=
_mm_and_si128
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_inv1
);
imm2_1
=
_mm_and_si128
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_inv1
);
// NOLINT
imm2_2
=
_mm_and_si128
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_inv1
);
// NOLINT
COPY_XMM_TO_IMM
(
imm2_1
,
imm2_2
,
imm2
);
y
=
_mm256_cvtepi32_ps
(
imm2
);
imm0_1
=
_mm_and_si128
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_4
);
imm0_2
=
_mm_and_si128
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_4
);
imm0_1
=
_mm_and_si128
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_4
);
// NOLINT
imm0_2
=
_mm_and_si128
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_4
);
// NOLINT
imm0_1
=
_mm_slli_epi32
(
imm0_1
,
29
);
imm0_2
=
_mm_slli_epi32
(
imm0_2
,
29
);
COPY_XMM_TO_IMM
(
imm0_1
,
imm0_2
,
imm0
);
imm2_1
=
_mm_and_si128
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_2
);
imm2_2
=
_mm_and_si128
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_2
);
imm2_1
=
_mm_and_si128
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_2
);
// NOLINT
imm2_2
=
_mm_and_si128
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_2
);
// NOLINT
imm2_1
=
_mm_cmpeq_epi32
(
imm2_1
,
_mm_setzero_si128
());
imm2_2
=
_mm_cmpeq_epi32
(
imm2_2
,
_mm_setzero_si128
());
...
...
@@ -418,9 +421,9 @@ v8sf sin256_ps(v8sf x) { // any x
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1
=
*
(
v8sf
*
)
_ps256_minus_cephes_DP1
;
xmm2
=
*
(
v8sf
*
)
_ps256_minus_cephes_DP2
;
xmm3
=
*
(
v8sf
*
)
_ps256_minus_cephes_DP3
;
xmm1
=
*
(
v8sf
*
)
_ps256_minus_cephes_DP1
;
// NOLINT
xmm2
=
*
(
v8sf
*
)
_ps256_minus_cephes_DP2
;
// NOLINT
xmm3
=
*
(
v8sf
*
)
_ps256_minus_cephes_DP3
;
// NOLINT
xmm1
=
_mm256_mul_ps
(
y
,
xmm1
);
xmm2
=
_mm256_mul_ps
(
y
,
xmm2
);
xmm3
=
_mm256_mul_ps
(
y
,
xmm3
);
...
...
@@ -429,26 +432,26 @@ v8sf sin256_ps(v8sf x) { // any x
x
=
_mm256_add_ps
(
x
,
xmm3
);
/* Evaluate the first polynom (0 <= x <= Pi/4) */
y
=
*
(
v8sf
*
)
_ps256_coscof_p0
;
y
=
*
(
v8sf
*
)
_ps256_coscof_p0
;
// NOLINT
v8sf
z
=
_mm256_mul_ps
(
x
,
x
);
y
=
_mm256_mul_ps
(
y
,
z
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_coscof_p1
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_coscof_p1
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
z
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_coscof_p2
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_coscof_p2
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
z
);
y
=
_mm256_mul_ps
(
y
,
z
);
v8sf
tmp
=
_mm256_mul_ps
(
z
,
*
(
v8sf
*
)
_ps256_0p5
);
v8sf
tmp
=
_mm256_mul_ps
(
z
,
*
(
v8sf
*
)
_ps256_0p5
);
// NOLINT
y
=
_mm256_sub_ps
(
y
,
tmp
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_1
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_1
);
// NOLINT
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
v8sf
y2
=
*
(
v8sf
*
)
_ps256_sincof_p0
;
v8sf
y2
=
*
(
v8sf
*
)
_ps256_sincof_p0
;
// NOLINT
y2
=
_mm256_mul_ps
(
y2
,
z
);
y2
=
_mm256_add_ps
(
y2
,
*
(
v8sf
*
)
_ps256_sincof_p1
);
y2
=
_mm256_add_ps
(
y2
,
*
(
v8sf
*
)
_ps256_sincof_p1
);
// NOLINT
y2
=
_mm256_mul_ps
(
y2
,
z
);
y2
=
_mm256_add_ps
(
y2
,
*
(
v8sf
*
)
_ps256_sincof_p2
);
y2
=
_mm256_add_ps
(
y2
,
*
(
v8sf
*
)
_ps256_sincof_p2
);
// NOLINT
y2
=
_mm256_mul_ps
(
y2
,
z
);
y2
=
_mm256_mul_ps
(
y2
,
x
);
y2
=
_mm256_add_ps
(
y2
,
x
);
...
...
@@ -475,53 +478,53 @@ v8sf cos256_ps(v8sf x) { // any x
#endif
/* take the absolute value */
x
=
_mm256_and_ps
(
x
,
*
(
v8sf
*
)
_ps256_inv_sign_mask
);
x
=
_mm256_and_ps
(
x
,
*
(
v8sf
*
)
_ps256_inv_sign_mask
);
// NOLINT
/* scale by 4/Pi */
y
=
_mm256_mul_ps
(
x
,
*
(
v8sf
*
)
_ps256_cephes_FOPI
);
y
=
_mm256_mul_ps
(
x
,
*
(
v8sf
*
)
_ps256_cephes_FOPI
);
// NOLINT
#ifdef __AVX2__
/* store the integer part of y in mm0 */
imm2
=
_mm256_cvttps_epi32
(
y
);
/* j=(j+1) & (~1) (see the cephes sources) */
imm2
=
avx2_mm256_add_epi32
(
imm2
,
*
(
v8si
*
)
_pi32_256_1
);
imm2
=
avx2_mm256_and_si256
(
imm2
,
*
(
v8si
*
)
_pi32_256_inv1
);
imm2
=
avx2_mm256_add_epi32
(
imm2
,
*
(
v8si
*
)
_pi32_256_1
);
// NOLINT
imm2
=
avx2_mm256_and_si256
(
imm2
,
*
(
v8si
*
)
_pi32_256_inv1
);
// NOLINT
y
=
_mm256_cvtepi32_ps
(
imm2
);
imm2
=
avx2_mm256_sub_epi32
(
imm2
,
*
(
v8si
*
)
_pi32_256_2
);
imm2
=
avx2_mm256_sub_epi32
(
imm2
,
*
(
v8si
*
)
_pi32_256_2
);
// NOLINT
/* get the swap sign flag */
imm0
=
avx2_mm256_andnot_si256
(
imm2
,
*
(
v8si
*
)
_pi32_256_4
);
imm0
=
avx2_mm256_andnot_si256
(
imm2
,
*
(
v8si
*
)
_pi32_256_4
);
// NOLINT
imm0
=
avx2_mm256_slli_epi32
(
imm0
,
29
);
/* get the polynom selection mask */
imm2
=
avx2_mm256_and_si256
(
imm2
,
*
(
v8si
*
)
_pi32_256_2
);
imm2
=
avx2_mm256_cmpeq_epi32
(
imm2
,
*
(
v8si
*
)
_pi32_256_0
);
imm2
=
avx2_mm256_and_si256
(
imm2
,
*
(
v8si
*
)
_pi32_256_2
);
// NOLINT
imm2
=
avx2_mm256_cmpeq_epi32
(
imm2
,
*
(
v8si
*
)
_pi32_256_0
);
// NOLINT
#else
/* we use SSE2 routines to perform the integer ops */
COPY_IMM_TO_XMM
(
_mm256_cvttps_epi32
(
y
),
imm2_1
,
imm2_2
);
imm2_1
=
_mm_add_epi32
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_1
);
imm2_2
=
_mm_add_epi32
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_1
);
imm2_1
=
_mm_add_epi32
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_1
);
// NOLINT
imm2_2
=
_mm_add_epi32
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_1
);
// NOLINT
imm2_1
=
_mm_and_si128
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_inv1
);
imm2_2
=
_mm_and_si128
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_inv1
);
imm2_1
=
_mm_and_si128
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_inv1
);
// NOLINT
imm2_2
=
_mm_and_si128
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_inv1
);
// NOLINT
COPY_XMM_TO_IMM
(
imm2_1
,
imm2_2
,
imm2
);
y
=
_mm256_cvtepi32_ps
(
imm2
);
imm2_1
=
_mm_sub_epi32
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_2
);
imm2_2
=
_mm_sub_epi32
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_2
);
imm2_1
=
_mm_sub_epi32
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_2
);
// NOLINT
imm2_2
=
_mm_sub_epi32
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_2
);
// NOLINT
imm0_1
=
_mm_andnot_si128
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_4
);
imm0_2
=
_mm_andnot_si128
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_4
);
imm0_1
=
_mm_andnot_si128
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_4
);
// NOLINT
imm0_2
=
_mm_andnot_si128
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_4
);
// NOLINT
imm0_1
=
_mm_slli_epi32
(
imm0_1
,
29
);
imm0_2
=
_mm_slli_epi32
(
imm0_2
,
29
);
COPY_XMM_TO_IMM
(
imm0_1
,
imm0_2
,
imm0
);
imm2_1
=
_mm_and_si128
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_2
);
imm2_2
=
_mm_and_si128
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_2
);
imm2_1
=
_mm_and_si128
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_2
);
// NOLINT
imm2_2
=
_mm_and_si128
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_2
);
// NOLINT
imm2_1
=
_mm_cmpeq_epi32
(
imm2_1
,
_mm_setzero_si128
());
imm2_2
=
_mm_cmpeq_epi32
(
imm2_2
,
_mm_setzero_si128
());
...
...
@@ -534,9 +537,9 @@ v8sf cos256_ps(v8sf x) { // any x
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1
=
*
(
v8sf
*
)
_ps256_minus_cephes_DP1
;
xmm2
=
*
(
v8sf
*
)
_ps256_minus_cephes_DP2
;
xmm3
=
*
(
v8sf
*
)
_ps256_minus_cephes_DP3
;
xmm1
=
*
(
v8sf
*
)
_ps256_minus_cephes_DP1
;
// NOLINT
xmm2
=
*
(
v8sf
*
)
_ps256_minus_cephes_DP2
;
// NOLINT
xmm3
=
*
(
v8sf
*
)
_ps256_minus_cephes_DP3
;
// NOLINT
xmm1
=
_mm256_mul_ps
(
y
,
xmm1
);
xmm2
=
_mm256_mul_ps
(
y
,
xmm2
);
xmm3
=
_mm256_mul_ps
(
y
,
xmm3
);
...
...
@@ -545,26 +548,26 @@ v8sf cos256_ps(v8sf x) { // any x
x
=
_mm256_add_ps
(
x
,
xmm3
);
/* Evaluate the first polynom (0 <= x <= Pi/4) */
y
=
*
(
v8sf
*
)
_ps256_coscof_p0
;
y
=
*
(
v8sf
*
)
_ps256_coscof_p0
;
// NOLINT
v8sf
z
=
_mm256_mul_ps
(
x
,
x
);
y
=
_mm256_mul_ps
(
y
,
z
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_coscof_p1
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_coscof_p1
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
z
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_coscof_p2
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_coscof_p2
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
z
);
y
=
_mm256_mul_ps
(
y
,
z
);
v8sf
tmp
=
_mm256_mul_ps
(
z
,
*
(
v8sf
*
)
_ps256_0p5
);
v8sf
tmp
=
_mm256_mul_ps
(
z
,
*
(
v8sf
*
)
_ps256_0p5
);
// NOLINT
y
=
_mm256_sub_ps
(
y
,
tmp
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_1
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_1
);
// NOLINT
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
v8sf
y2
=
*
(
v8sf
*
)
_ps256_sincof_p0
;
v8sf
y2
=
*
(
v8sf
*
)
_ps256_sincof_p0
;
// NOLINT
y2
=
_mm256_mul_ps
(
y2
,
z
);
y2
=
_mm256_add_ps
(
y2
,
*
(
v8sf
*
)
_ps256_sincof_p1
);
y2
=
_mm256_add_ps
(
y2
,
*
(
v8sf
*
)
_ps256_sincof_p1
);
// NOLINT
y2
=
_mm256_mul_ps
(
y2
,
z
);
y2
=
_mm256_add_ps
(
y2
,
*
(
v8sf
*
)
_ps256_sincof_p2
);
y2
=
_mm256_add_ps
(
y2
,
*
(
v8sf
*
)
_ps256_sincof_p2
);
// NOLINT
y2
=
_mm256_mul_ps
(
y2
,
z
);
y2
=
_mm256_mul_ps
(
y2
,
x
);
y2
=
_mm256_add_ps
(
y2
,
x
);
...
...
@@ -595,42 +598,43 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
sign_bit_sin
=
x
;
/* take the absolute value */
x
=
_mm256_and_ps
(
x
,
*
(
v8sf
*
)
_ps256_inv_sign_mask
);
x
=
_mm256_and_ps
(
x
,
*
(
v8sf
*
)
_ps256_inv_sign_mask
);
// NOLINT
/* extract the sign bit (upper one) */
sign_bit_sin
=
_mm256_and_ps
(
sign_bit_sin
,
*
(
v8sf
*
)
_ps256_sign_mask
);
sign_bit_sin
=
_mm256_and_ps
(
sign_bit_sin
,
*
(
v8sf
*
)
_ps256_sign_mask
);
// NOLINT
/* scale by 4/Pi */
y
=
_mm256_mul_ps
(
x
,
*
(
v8sf
*
)
_ps256_cephes_FOPI
);
y
=
_mm256_mul_ps
(
x
,
*
(
v8sf
*
)
_ps256_cephes_FOPI
);
// NOLINT
#ifdef __AVX2__
/* store the integer part of y in imm2 */
imm2
=
_mm256_cvttps_epi32
(
y
);
/* j=(j+1) & (~1) (see the cephes sources) */
imm2
=
avx2_mm256_add_epi32
(
imm2
,
*
(
v8si
*
)
_pi32_256_1
);
imm2
=
avx2_mm256_and_si256
(
imm2
,
*
(
v8si
*
)
_pi32_256_inv1
);
imm2
=
avx2_mm256_add_epi32
(
imm2
,
*
(
v8si
*
)
_pi32_256_1
);
// NOLINT
imm2
=
avx2_mm256_and_si256
(
imm2
,
*
(
v8si
*
)
_pi32_256_inv1
);
// NOLINT
y
=
_mm256_cvtepi32_ps
(
imm2
);
imm4
=
imm2
;
/* get the swap sign flag for the sine */
imm0
=
avx2_mm256_and_si256
(
imm2
,
*
(
v8si
*
)
_pi32_256_4
);
imm0
=
avx2_mm256_and_si256
(
imm2
,
*
(
v8si
*
)
_pi32_256_4
);
// NOLINT
imm0
=
avx2_mm256_slli_epi32
(
imm0
,
29
);
// v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
/* get the polynom selection mask for the sine*/
imm2
=
avx2_mm256_and_si256
(
imm2
,
*
(
v8si
*
)
_pi32_256_2
);
imm2
=
avx2_mm256_cmpeq_epi32
(
imm2
,
*
(
v8si
*
)
_pi32_256_0
);
imm2
=
avx2_mm256_and_si256
(
imm2
,
*
(
v8si
*
)
_pi32_256_2
);
// NOLINT
imm2
=
avx2_mm256_cmpeq_epi32
(
imm2
,
*
(
v8si
*
)
_pi32_256_0
);
// NOLINT
// v8sf poly_mask = _mm256_castsi256_ps(imm2);
#else
/* we use SSE2 routines to perform the integer ops */
COPY_IMM_TO_XMM
(
_mm256_cvttps_epi32
(
y
),
imm2_1
,
imm2_2
);
imm2_1
=
_mm_add_epi32
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_1
);
imm2_2
=
_mm_add_epi32
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_1
);
imm2_1
=
_mm_add_epi32
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_1
);
// NOLINT
imm2_2
=
_mm_add_epi32
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_1
);
// NOLINT
imm2_1
=
_mm_and_si128
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_inv1
);
imm2_2
=
_mm_and_si128
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_inv1
);
imm2_1
=
_mm_and_si128
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_inv1
);
// NOLINT
imm2_2
=
_mm_and_si128
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_inv1
);
// NOLINT
COPY_XMM_TO_IMM
(
imm2_1
,
imm2_2
,
imm2
);
y
=
_mm256_cvtepi32_ps
(
imm2
);
...
...
@@ -638,16 +642,16 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
imm4_1
=
imm2_1
;
imm4_2
=
imm2_2
;
imm0_1
=
_mm_and_si128
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_4
);
imm0_2
=
_mm_and_si128
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_4
);
imm0_1
=
_mm_and_si128
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_4
);
// NOLINT
imm0_2
=
_mm_and_si128
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_4
);
// NOLINT
imm0_1
=
_mm_slli_epi32
(
imm0_1
,
29
);
imm0_2
=
_mm_slli_epi32
(
imm0_2
,
29
);
COPY_XMM_TO_IMM
(
imm0_1
,
imm0_2
,
imm0
);
imm2_1
=
_mm_and_si128
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_2
);
imm2_2
=
_mm_and_si128
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_2
);
imm2_1
=
_mm_and_si128
(
imm2_1
,
*
(
v4si
*
)
_pi32avx_2
);
// NOLINT
imm2_2
=
_mm_and_si128
(
imm2_2
,
*
(
v4si
*
)
_pi32avx_2
);
// NOLINT
imm2_1
=
_mm_cmpeq_epi32
(
imm2_1
,
_mm_setzero_si128
());
imm2_2
=
_mm_cmpeq_epi32
(
imm2_2
,
_mm_setzero_si128
());
...
...
@@ -659,9 +663,9 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1
=
*
(
v8sf
*
)
_ps256_minus_cephes_DP1
;
xmm2
=
*
(
v8sf
*
)
_ps256_minus_cephes_DP2
;
xmm3
=
*
(
v8sf
*
)
_ps256_minus_cephes_DP3
;
xmm1
=
*
(
v8sf
*
)
_ps256_minus_cephes_DP1
;
// NOLINT
xmm2
=
*
(
v8sf
*
)
_ps256_minus_cephes_DP2
;
// NOLINT
xmm3
=
*
(
v8sf
*
)
_ps256_minus_cephes_DP3
;
// NOLINT
xmm1
=
_mm256_mul_ps
(
y
,
xmm1
);
xmm2
=
_mm256_mul_ps
(
y
,
xmm2
);
xmm3
=
_mm256_mul_ps
(
y
,
xmm3
);
...
...
@@ -670,15 +674,15 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
x
=
_mm256_add_ps
(
x
,
xmm3
);
#ifdef __AVX2__
imm4
=
avx2_mm256_sub_epi32
(
imm4
,
*
(
v8si
*
)
_pi32_256_2
);
imm4
=
avx2_mm256_andnot_si256
(
imm4
,
*
(
v8si
*
)
_pi32_256_4
);
imm4
=
avx2_mm256_sub_epi32
(
imm4
,
*
(
v8si
*
)
_pi32_256_2
);
// NOLINT
imm4
=
avx2_mm256_andnot_si256
(
imm4
,
*
(
v8si
*
)
_pi32_256_4
);
// NOLINT
imm4
=
avx2_mm256_slli_epi32
(
imm4
,
29
);
#else
imm4_1
=
_mm_sub_epi32
(
imm4_1
,
*
(
v4si
*
)
_pi32avx_2
);
imm4_2
=
_mm_sub_epi32
(
imm4_2
,
*
(
v4si
*
)
_pi32avx_2
);
imm4_1
=
_mm_sub_epi32
(
imm4_1
,
*
(
v4si
*
)
_pi32avx_2
);
// NOLINT
imm4_2
=
_mm_sub_epi32
(
imm4_2
,
*
(
v4si
*
)
_pi32avx_2
);
// NOLINT
imm4_1
=
_mm_andnot_si128
(
imm4_1
,
*
(
v4si
*
)
_pi32avx_4
);
imm4_2
=
_mm_andnot_si128
(
imm4_2
,
*
(
v4si
*
)
_pi32avx_4
);
imm4_1
=
_mm_andnot_si128
(
imm4_1
,
*
(
v4si
*
)
_pi32avx_4
);
// NOLINT
imm4_2
=
_mm_andnot_si128
(
imm4_2
,
*
(
v4si
*
)
_pi32avx_4
);
// NOLINT
imm4_1
=
_mm_slli_epi32
(
imm4_1
,
29
);
imm4_2
=
_mm_slli_epi32
(
imm4_2
,
29
);
...
...
@@ -692,25 +696,25 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
/* Evaluate the first polynom (0 <= x <= Pi/4) */
v8sf
z
=
_mm256_mul_ps
(
x
,
x
);
y
=
*
(
v8sf
*
)
_ps256_coscof_p0
;
y
=
*
(
v8sf
*
)
_ps256_coscof_p0
;
// NOLINT
y
=
_mm256_mul_ps
(
y
,
z
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_coscof_p1
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_coscof_p1
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
z
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_coscof_p2
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_coscof_p2
);
// NOLINT
y
=
_mm256_mul_ps
(
y
,
z
);
y
=
_mm256_mul_ps
(
y
,
z
);
v8sf
tmp
=
_mm256_mul_ps
(
z
,
*
(
v8sf
*
)
_ps256_0p5
);
v8sf
tmp
=
_mm256_mul_ps
(
z
,
*
(
v8sf
*
)
_ps256_0p5
);
// NOLINT
y
=
_mm256_sub_ps
(
y
,
tmp
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_1
);
y
=
_mm256_add_ps
(
y
,
*
(
v8sf
*
)
_ps256_1
);
// NOLINT
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
v8sf
y2
=
*
(
v8sf
*
)
_ps256_sincof_p0
;
v8sf
y2
=
*
(
v8sf
*
)
_ps256_sincof_p0
;
// NOLINT
y2
=
_mm256_mul_ps
(
y2
,
z
);
y2
=
_mm256_add_ps
(
y2
,
*
(
v8sf
*
)
_ps256_sincof_p1
);
y2
=
_mm256_add_ps
(
y2
,
*
(
v8sf
*
)
_ps256_sincof_p1
);
// NOLINT
y2
=
_mm256_mul_ps
(
y2
,
z
);
y2
=
_mm256_add_ps
(
y2
,
*
(
v8sf
*
)
_ps256_sincof_p2
);
y2
=
_mm256_add_ps
(
y2
,
*
(
v8sf
*
)
_ps256_sincof_p2
);
// NOLINT
y2
=
_mm256_mul_ps
(
y2
,
z
);
y2
=
_mm256_mul_ps
(
y2
,
x
);
y2
=
_mm256_add_ps
(
y2
,
x
);
...
...
@@ -729,3 +733,6 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
*
s
=
_mm256_xor_ps
(
xmm1
,
sign_bit_sin
);
*
c
=
_mm256_xor_ps
(
xmm2
,
sign_bit_cos
);
}
}
// namespace lite
}
// namespace paddle
lite/core/kernel.h
浏览文件 @
6554854a
...
...
@@ -83,14 +83,11 @@ class KernelBase {
#if defined(LITE_WITH_CUDA)
WorkSpace
::
Global_CUDA
().
AllocReset
();
#endif
#ifdef LITE_WITH_PROFILE
CHECK
(
profiler_
)
<<
"Profiler pointer of kernel can not be nullptr. "
"When LITE_WITH_PROFILE is defined, please set a "
"Profiler for Instruction."
;
profiler_
->
StartTiming
(
profile_id_
,
ctx_
.
get
());
profiler_
->
StopTiming
(
profile
::
Type
::
kCreate
,
profile_id_
,
ctx_
.
get
());
profiler_
->
StartTiming
(
profile
::
Type
::
kDispatch
,
profile_id_
,
ctx_
.
get
());
Run
();
profiler_
->
StopTiming
(
profile_id_
,
ctx_
.
get
());
profiler_
->
StopTiming
(
profile
::
Type
::
kDispatch
,
profile
_id_
,
ctx_
.
get
());
#else
Run
();
#endif
...
...
lite/core/memory.h
浏览文件 @
6554854a
...
...
@@ -120,6 +120,7 @@ class Buffer {
if
(
space_
>
0
)
{
TargetFree
(
target_
,
data_
);
}
data_
=
nullptr
;
target_
=
TargetType
::
kHost
;
space_
=
0
;
}
...
...
lite/core/profile/profiler.cc
浏览文件 @
6554854a
...
...
@@ -28,36 +28,55 @@ auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) {
};
}
int
Profiler
::
NewTimer
(
const
OpCharacter
&
ch
)
{
StatisUnit
unit
;
unit
.
character
=
ch
;
std
::
map
<
Type
,
std
::
string
>
TypeStr
{
{
Type
::
kUnk
,
"Unknown"
},
{
Type
::
kCreate
,
"Create"
},
{
Type
::
kDispatch
,
"Dispatch"
},
};
StatisUnit
::
StatisUnit
(
const
OpCharacter
&
ch
)
:
character
(
ch
)
{
create_t
.
reset
(
new
DeviceTimer
<
TargetType
::
kHost
>
());
if
(
ch
.
target
==
TargetType
::
kCUDA
)
{
#ifdef LITE_WITH_CUDA
unit
.
timer
.
reset
(
new
DeviceTimer
<
TargetType
::
kCUDA
>
());
dispatch_t
.
reset
(
new
DeviceTimer
<
TargetType
::
kCUDA
>
());
#else
LOG
(
ERROR
)
<<
"The timer type specified as cuda is uninitialized, so the "
"default x86 timer is used instead."
;
#endif
}
else
{
unit
.
timer
.
reset
(
new
DeviceTimer
<
TargetType
::
kHost
>
());
dispatch_t
.
reset
(
new
DeviceTimer
<
TargetType
::
kHost
>
());
}
}
lite
::
profile
::
Timer
*
StatisUnit
::
Timer
(
Type
type
)
{
if
(
type
==
Type
::
kCreate
)
{
return
create_t
.
get
();
}
else
if
(
type
==
Type
::
kDispatch
)
{
return
dispatch_t
.
get
();
}
LOG
(
FATAL
)
<<
"Timer cannot be returned for unknown platforms."
;
return
nullptr
;
}
int
Profiler
::
NewTimer
(
const
OpCharacter
&
ch
)
{
StatisUnit
unit
(
ch
);
units_
.
push_back
(
std
::
move
(
unit
));
return
units_
.
size
()
-
1
;
}
void
Profiler
::
StartTiming
(
const
int
index
,
KernelContext
*
ctx
)
{
void
Profiler
::
StartTiming
(
Type
type
,
const
int
index
,
KernelContext
*
ctx
)
{
CHECK_LT
(
index
,
units_
.
size
())
<<
"The timer index in the profiler is out of range."
;
units_
[
index
].
timer
->
Start
(
ctx
);
units_
[
index
].
Timer
(
type
)
->
Start
(
ctx
);
}
float
Profiler
::
StopTiming
(
const
int
index
,
KernelContext
*
ctx
)
{
float
Profiler
::
StopTiming
(
Type
type
,
const
int
index
,
KernelContext
*
ctx
)
{
CHECK_LT
(
index
,
units_
.
size
())
<<
"The timer index in the profiler is out of range."
;
return
units_
[
index
].
timer
->
Stop
(
ctx
);
return
units_
[
index
].
Timer
(
type
)
->
Stop
(
ctx
);
}
std
::
string
Profiler
::
Summary
(
bool
concise
,
size_t
w
)
{
std
::
string
Profiler
::
Summary
(
Type
type
,
bool
concise
,
size_t
w
)
{
using
std
::
setw
;
using
std
::
left
;
using
std
::
fixed
;
...
...
@@ -65,12 +84,14 @@ std::string Profiler::Summary(bool concise, size_t w) {
std
::
string
title
;
// Title.
if
(
concise
)
{
ss
<<
"Timing cycle = "
<<
units_
.
front
().
timer
->
LapTimes
().
Size
()
ss
<<
"Timing cycle = "
<<
units_
.
front
().
Timer
(
type
)
->
LapTimes
().
Size
()
<<
std
::
endl
;
ss
<<
"===== Concise Profiler Summary: "
<<
name_
<<
", Exclude "
<<
w
ss
<<
"===== Concise "
<<
TypeStr
.
find
(
type
)
->
second
<<
" Profiler Summary: "
<<
name_
<<
", Exclude "
<<
w
<<
" warm-ups ====="
<<
std
::
endl
;
}
else
{
ss
<<
"===== Detailed Profiler Summary: "
<<
name_
<<
", Exclude "
<<
w
ss
<<
"===== Detailed "
<<
TypeStr
.
find
(
type
)
->
second
<<
" Profiler Summary: "
<<
name_
<<
", Exclude "
<<
w
<<
" warm-ups ====="
<<
std
::
endl
;
}
ss
<<
setw
(
25
)
<<
left
<<
"Operator Type"
...
...
@@ -84,16 +105,16 @@ std::string Profiler::Summary(bool concise, size_t w) {
if
(
concise
)
{
std
::
map
<
OpCharacter
,
TimeInfo
,
decltype
(
op_comp
)
>
summary
(
op_comp
);
for
(
auto
&
unit
:
units_
)
{
auto
ch
=
summary
.
find
(
unit
.
character
);
auto
ch
=
summary
.
find
(
unit
.
Character
()
);
if
(
ch
!=
summary
.
end
())
{
ch
->
second
.
avg
+=
unit
.
timer
->
LapTimes
().
Avg
(
w
);
ch
->
second
.
min
+=
unit
.
timer
->
LapTimes
().
Min
(
w
);
ch
->
second
.
max
+=
unit
.
timer
->
LapTimes
().
Max
(
w
);
ch
->
second
.
avg
+=
unit
.
Timer
(
type
)
->
LapTimes
().
Avg
(
w
);
ch
->
second
.
min
+=
unit
.
Timer
(
type
)
->
LapTimes
().
Min
(
w
);
ch
->
second
.
max
+=
unit
.
Timer
(
type
)
->
LapTimes
().
Max
(
w
);
}
else
{
TimeInfo
info
({
unit
.
timer
->
LapTimes
().
Avg
(
w
),
unit
.
timer
->
LapTimes
().
Min
(
w
),
unit
.
timer
->
LapTimes
().
Max
(
w
)});
summary
.
insert
({
unit
.
character
,
info
});
TimeInfo
info
({
unit
.
Timer
(
type
)
->
LapTimes
().
Avg
(
w
),
unit
.
Timer
(
type
)
->
LapTimes
().
Min
(
w
),
unit
.
Timer
(
type
)
->
LapTimes
().
Max
(
w
)});
summary
.
insert
({
unit
.
Character
()
,
info
});
}
}
for
(
const
auto
&
item
:
summary
)
{
...
...
@@ -109,14 +130,15 @@ std::string Profiler::Summary(bool concise, size_t w) {
}
}
else
{
for
(
auto
&
unit
:
units_
)
{
const
auto
&
times
=
unit
.
Timer
(
type
)
->
LapTimes
();
// clang-format off
ss
<<
setw
(
25
)
<<
left
<<
fixed
<<
unit
.
character
.
op_type
\
<<
" "
<<
setw
(
40
)
<<
left
<<
fixed
<<
unit
.
character
.
kernel_name
\
<<
" "
<<
setw
(
12
)
<<
left
<<
fixed
<<
unit
.
character
.
remark
\
<<
" "
<<
setw
(
12
)
<<
left
<<
fixed
<<
unit
.
timer
->
LapTimes
().
Avg
(
w
)
\
<<
" "
<<
setw
(
12
)
<<
left
<<
fixed
<<
unit
.
timer
->
LapTimes
().
Min
(
w
)
\
<<
" "
<<
setw
(
12
)
<<
left
<<
fixed
<<
unit
.
timer
->
LapTimes
().
Max
(
w
)
\
<<
" "
<<
setw
(
12
)
<<
left
<<
fixed
<<
unit
.
timer
->
LapTimes
().
Last
(
w
)
\
ss
<<
setw
(
25
)
<<
left
<<
fixed
<<
unit
.
Character
().
op_type
\
<<
" "
<<
setw
(
40
)
<<
left
<<
fixed
<<
unit
.
Character
().
kernel_name
\
<<
" "
<<
setw
(
12
)
<<
left
<<
fixed
<<
unit
.
Character
().
remark
\
<<
" "
<<
setw
(
12
)
<<
left
<<
fixed
<<
times
.
Avg
(
w
)
\
<<
" "
<<
setw
(
12
)
<<
left
<<
fixed
<<
times
.
Min
(
w
)
\
<<
" "
<<
setw
(
12
)
<<
left
<<
fixed
<<
times
.
Max
(
w
)
\
<<
" "
<<
setw
(
12
)
<<
left
<<
fixed
<<
times
.
Last
(
w
)
\
<<
std
::
endl
;
// clang-format on
}
...
...
lite/core/profile/profiler.h
浏览文件 @
6554854a
...
...
@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
...
...
@@ -22,6 +23,14 @@ namespace paddle {
namespace
lite
{
namespace
profile
{
enum
class
Type
{
kUnk
=
0
,
kCreate
,
kDispatch
,
};
extern
std
::
map
<
Type
,
std
::
string
>
TypeStr
;
struct
TimeInfo
{
float
avg
;
float
min
;
...
...
@@ -35,8 +44,15 @@ struct OpCharacter {
std
::
string
remark
{
std
::
string
(
"N/A"
)};
};
struct
StatisUnit
{
std
::
unique_ptr
<
Timer
>
timer
;
class
StatisUnit
final
{
public:
explicit
StatisUnit
(
const
OpCharacter
&
ch
);
lite
::
profile
::
Timer
*
Timer
(
Type
type
);
const
OpCharacter
&
Character
()
const
{
return
character
;
}
protected:
std
::
unique_ptr
<
lite
::
profile
::
Timer
>
create_t
;
std
::
unique_ptr
<
lite
::
profile
::
Timer
>
dispatch_t
;
OpCharacter
character
;
};
...
...
@@ -45,9 +61,9 @@ class Profiler final {
Profiler
()
=
default
;
explicit
Profiler
(
const
std
::
string
&
name
)
:
name_
(
name
)
{}
int
NewTimer
(
const
OpCharacter
&
ch
);
void
StartTiming
(
const
int
index
,
KernelContext
*
ctx
);
float
StopTiming
(
const
int
index
,
KernelContext
*
ctx
);
std
::
string
Summary
(
bool
concise
=
true
,
size_t
warm_up
=
10
);
void
StartTiming
(
Type
type
,
const
int
index
,
KernelContext
*
ctx
);
float
StopTiming
(
Type
type
,
const
int
index
,
KernelContext
*
ctx
);
std
::
string
Summary
(
Type
type
,
bool
concise
=
true
,
size_t
warm_up
=
10
);
private:
std
::
string
name_
{
std
::
string
(
"N/A"
)};
...
...
lite/core/profile/test_timer.cc
浏览文件 @
6554854a
...
...
@@ -69,10 +69,10 @@ TEST(profiler, real_latency) {
ch
.
op_type
=
"operator/1"
;
ch
.
kernel_name
=
"kernel/1"
;
int
idx
=
profiler
.
NewTimer
(
ch
);
profiler
.
StartTiming
(
idx
,
&
ctx
);
profiler
.
StartTiming
(
Type
::
kDispatch
,
idx
,
&
ctx
);
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
milliseconds
(
10
));
profiler
.
StopTiming
(
idx
,
&
ctx
);
std
::
cout
<<
profiler
.
Summary
();
profiler
.
StopTiming
(
Type
::
kDispatch
,
idx
,
&
ctx
);
std
::
cout
<<
profiler
.
Summary
(
Type
::
kDispatch
);
}
#endif
...
...
lite/core/program.cc
浏览文件 @
6554854a
...
...
@@ -147,7 +147,7 @@ void RuntimeProgram::Run() {
#endif // LITE_WITH_PROFILE
}
#ifdef LITE_WITH_PROFILE
LOG
(
INFO
)
<<
"
\n
"
<<
profiler_
.
Summary
(
false
,
0
);
LOG
(
INFO
)
<<
"
\n
"
<<
profiler_
.
Summary
(
profile
::
Type
::
kDispatch
,
false
,
0
);
#endif // LITE_WITH_PROFILE
}
...
...
@@ -252,8 +252,16 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog) {
}
void
Instruction
::
Run
()
{
#ifdef LITE_WITH_PROFILE
CHECK
(
profiler_
)
<<
"Profiler pointer of kernel can not be nullptr. "
"When LITE_WITH_PROFILE is defined, please set a "
"Profiler for Instruction."
;
profiler_
->
StartTiming
(
profile
::
Type
::
kCreate
,
profile_id_
,
kernel_
->
mutable_context
());
#endif
CHECK
(
op_
)
<<
"op null"
;
CHECK
(
kernel_
)
<<
"kernel null"
;
if
(
first_epoch_
)
{
first_epoch_
=
false
;
CHECK
(
op_
->
CheckShape
());
...
...
@@ -263,10 +271,7 @@ void Instruction::Run() {
return
;
}
// VLOG(4) << "kernel launch";
op_
->
InferShape
();
// VLOG(4) << ">> Running kernel: " << op_->op_info()->Repr() << " on Target "
// << TargetToStr(kernel_->target());
kernel_
->
Launch
();
has_run_
=
true
;
}
...
...
lite/core/program.h
浏览文件 @
6554854a
...
...
@@ -143,7 +143,8 @@ class LITE_API RuntimeProgram {
}
~
RuntimeProgram
()
{
#ifdef LITE_WITH_PROFILE
LOG
(
INFO
)
<<
"
\n
"
<<
profiler_
.
Summary
();
LOG
(
INFO
)
<<
"
\n
"
<<
profiler_
.
Summary
(
profile
::
Type
::
kCreate
);
LOG
(
INFO
)
<<
"
\n
"
<<
profiler_
.
Summary
(
profile
::
Type
::
kDispatch
);
#endif // LITE_WITH_PROFILE
}
...
...
lite/core/tensor.h
浏览文件 @
6554854a
...
...
@@ -233,6 +233,10 @@ class TensorLite {
(
static_cast
<
char
*>
(
buffer_
->
data
())
+
offset_
));
}
void
clear
()
{
buffer_
->
Free
();
offset_
=
0
;
}
// Returns dims().production().
// NOTE(review): presumably the element count (product of all dimensions) —
// confirm against DDim::production.
size_t data_size() const {
  const auto& shape = this->dims();
  return shape.production();
}
size_t
memory_size
()
const
{
return
memory_size_
;
}
...
...
lite/kernels/arm/conditional_block_compute.cc
浏览文件 @
6554854a
...
...
@@ -34,6 +34,9 @@ void ConditionalBlockCompute::PrepareForRun() {
}
void
ConditionalBlockCompute
::
Run
()
{
auto
&
param
=
Param
<
operators
::
ConditionalBlockParam
>
();
for
(
auto
&
out
:
param
.
outs
)
{
out
->
clear
();
}
bool
need_run
=
true
;
if
(
param
.
is_scalar_condition
)
{
auto
*
cond
=
param
.
cond
;
...
...
lite/kernels/arm/split_lod_tensor_compute.cc
浏览文件 @
6554854a
...
...
@@ -82,6 +82,10 @@ void SplitLodTensorCompute::Run() {
ranges
.
begin
(),
ranges
.
end
(),
0UL
,
[](
size_t
a
,
const
CopyRange
&
b
)
{
return
a
+
b
.
end
-
b
.
begin
;
});
if
(
height
==
0
)
{
out
->
clear
();
continue
;
}
auto
x_dim
=
x
->
dims
();
x_dim
[
0
]
=
static_cast
<
int64_t
>
(
height
);
out
->
Resize
(
x_dim
);
...
...
lite/kernels/arm/unsqueeze_compute.cc
浏览文件 @
6554854a
...
...
@@ -54,12 +54,12 @@ REGISTER_LITE_KERNEL(unsqueeze,
kNCHW
,
paddle
::
lite
::
kernels
::
host
::
UnsqueezeCompute
,
def
)
.
BindInput
(
"X"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
BindInput
(
"X"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
)
,
PRECISION
(
kAny
)
)})
.
BindInput
(
"AxesTensor"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
),
PRECISION
(
kInt32
))})
.
BindInput
(
"AxesTensorList"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
),
PRECISION
(
kInt32
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
)
,
PRECISION
(
kAny
)
)})
.
Finalize
();
REGISTER_LITE_KERNEL
(
unsqueeze2
,
...
...
@@ -68,11 +68,11 @@ REGISTER_LITE_KERNEL(unsqueeze2,
kNCHW
,
paddle
::
lite
::
kernels
::
host
::
Unsqueeze2Compute
,
def
)
.
BindInput
(
"X"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
BindInput
(
"X"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
)
,
PRECISION
(
kAny
)
)})
.
BindInput
(
"AxesTensor"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
),
PRECISION
(
kInt32
))})
.
BindInput
(
"AxesTensorList"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
),
PRECISION
(
kInt32
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
BindOutput
(
"Out"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
)
,
PRECISION
(
kAny
)
)})
.
BindOutput
(
"XShape"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
Finalize
();
lite/kernels/arm/yolo_box_compute.cc
浏览文件 @
6554854a
...
...
@@ -54,7 +54,8 @@ REGISTER_LITE_KERNEL(yolo_box,
paddle
::
lite
::
kernels
::
arm
::
YoloBoxCompute
,
def
)
.
BindInput
(
"X"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
BindInput
(
"ImgSize"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
BindInput
(
"ImgSize"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
),
PRECISION
(
kInt32
))})
.
BindOutput
(
"Boxes"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
BindOutput
(
"Scores"
,
{
LiteType
::
GetTensorTy
(
TARGET
(
kARM
))})
.
Finalize
();
lite/kernels/cuda/softmax_compute.cu
浏览文件 @
6554854a
...
...
@@ -156,8 +156,8 @@ void SoftmaxCompute::PrepareForRun() {
cudaGetDevice
(
&
device_id
);
cudaDeviceProp
deviceProp
;
cudaGetDeviceProperties
(
&
deviceProp
,
device_id
);
sharedmem_size
=
deviceProp
.
sharedMemPerBlock
;
max_dimsize
=
sharedmem_size
/
sizeof
(
float
)
/
CUDA_NUM_THREADS
;
sharedmem_size
_
=
deviceProp
.
sharedMemPerBlock
;
max_dimsize
_
=
sharedmem_size_
/
sizeof
(
float
)
/
CUDA_NUM_THREADS
;
}
void
SoftmaxCompute
::
Run
()
{
...
...
@@ -174,29 +174,27 @@ void SoftmaxCompute::Run() {
int
outer_num
=
x_dims
.
Slice
(
0
,
axis
).
production
();
int
inner_num
=
x_dims
.
Slice
(
axis
+
1
,
x_rank
).
production
();
int
total_threads
=
inner_num
*
outer_num
;
int
axis_size
=
x_dims
[
axis
];
axis_size_
=
x_dims
[
axis
];
const
int
threads
=
CUDA_NUM_THREADS
;
const
int
blocks
=
(
total_threads
+
threads
-
1
)
/
threads
;
auto
input_data
=
param
.
x
->
data
<
float
>
();
auto
output_data
=
param
.
output
->
mutable_data
<
float
>
(
TARGET
(
kCUDA
));
if
(
axis_size
<=
max_dimsize
)
{
int
use_sharemem_size
=
axis_size
*
threads
*
sizeof
(
float
);
if
(
axis_size
_
<=
max_dimsize_
)
{
int
use_sharemem_size
=
axis_size
_
*
threads
*
sizeof
(
float
);
sharemem_softmax_kernel
<<<
blocks
,
threads
,
use_sharemem_size
,
stream
>>>
(
total_threads
,
input_data
,
output_data
,
inner_num
,
outer_num
,
axis_size
);
axis_size
_
);
}
else
{
//! re_alloc device memory
Tensor
tmax_data
;
Tensor
tsum_data
;
tmax_data
.
Resize
({
1
,
1
,
1
,
outer_num
*
inner_num
});
tsum_data
.
Resize
({
1
,
1
,
1
,
outer_num
*
inner_num
});
auto
max_data
=
tmax_data
.
mutable_data
<
float
>
(
TARGET
(
kCUDA
));
auto
sum_data
=
tsum_data
.
mutable_data
<
float
>
(
TARGET
(
kCUDA
));
tmax_data_
.
Resize
({
1
,
1
,
1
,
outer_num
*
inner_num
});
tsum_data_
.
Resize
({
1
,
1
,
1
,
outer_num
*
inner_num
});
auto
max_data
=
tmax_data_
.
mutable_data
<
float
>
(
TARGET
(
kCUDA
));
auto
sum_data
=
tsum_data_
.
mutable_data
<
float
>
(
TARGET
(
kCUDA
));
//! firstly, get maximum data
float
min_data
=
std
::
numeric_limits
<
float
>::
lowest
();
softmax_max_kernel
<
float
><<<
blocks
,
threads
,
0
,
stream
>>>
(
total_threads
,
...
...
@@ -205,7 +203,7 @@ void SoftmaxCompute::Run() {
min_data
,
inner_num
,
outer_num
,
axis_size
);
axis_size
_
);
//! then, compute exp and sum data
softmax_sub_exp_sum_kernel
<
float
><<<
blocks
,
threads
,
0
,
stream
>>>
(
total_threads
,
...
...
@@ -215,10 +213,10 @@ void SoftmaxCompute::Run() {
sum_data
,
inner_num
,
outer_num
,
axis_size
);
axis_size
_
);
//! last, compute divided output
softmax_divid_output_kernel
<
float
><<<
blocks
,
threads
,
0
,
stream
>>>
(
total_threads
,
output_data
,
sum_data
,
inner_num
,
outer_num
,
axis_size
);
total_threads
,
output_data
,
sum_data
,
inner_num
,
outer_num
,
axis_size
_
);
}
cudaError_t
error
=
cudaGetLastError
();
if
(
error
!=
cudaSuccess
)
LOG
(
ERROR
)
<<
cudaGetErrorString
(
error
);
...
...
lite/kernels/cuda/softmax_compute.h
浏览文件 @
6554854a
...
...
@@ -30,9 +30,11 @@ class SoftmaxCompute
virtual
~
SoftmaxCompute
()
=
default
;
private:
size_t
sharedmem_size
;
int
num_threads
;
int
max_dimsize
;
lite
::
Tensor
tmax_data_
;
lite
::
Tensor
tsum_data_
;
size_t
sharedmem_size_
;
int
max_dimsize_
;
int
axis_size_
;
};
}
// namespace cuda
...
...
lite/kernels/npu/bridges/engine.h
浏览文件 @
6554854a
...
...
@@ -28,12 +28,14 @@ namespace subgraph {
class
Engine
{
public:
Engine
(
int
block_idx
,
Engine
(
KernelContext
*
ctx
,
int
block_idx
,
cpp
::
BlockDesc
*
block_desc
,
const
std
::
vector
<
std
::
string
>
&
input_names
,
const
std
::
vector
<
std
::
string
>
&
output_names
,
lite
::
Scope
*
scope
)
:
block_idx_
(
block_idx
),
:
ctx_
(
ctx
),
block_idx_
(
block_idx
),
block_desc_
(
block_desc
),
input_names_
(
input_names
),
output_names_
(
output_names
),
...
...
@@ -55,6 +57,7 @@ class Engine {
virtual
bool
InputShapeChanged
();
KernelContext
*
ctx_
{
nullptr
};
int
block_idx_
;
cpp
::
BlockDesc
*
block_desc_
;
std
::
vector
<
std
::
string
>
input_names_
;
...
...
lite/kernels/npu/subgraph_compute.cc
浏览文件 @
6554854a
...
...
@@ -207,7 +207,8 @@ int SubgraphEngine::LaunchDeviceProgram() {
void
SubgraphCompute
::
PrepareForRun
()
{
auto
&
param
=
this
->
Param
<
param_t
>
();
engine_
.
reset
(
new
SubgraphEngine
(
param
.
sub_block_idx
,
engine_
.
reset
(
new
SubgraphEngine
(
ctx_
.
get
(),
param
.
sub_block_idx
,
param
.
sub_block_desc
,
param
.
input_data_names
,
param
.
output_data_names
,
...
...
lite/kernels/npu/subgraph_compute.h
浏览文件 @
6554854a
...
...
@@ -29,13 +29,14 @@ namespace npu {
class
SubgraphEngine
:
public
subgraph
::
Engine
{
public:
SubgraphEngine
(
int
block_idx
,
SubgraphEngine
(
KernelContext
*
ctx
,
int
block_idx
,
cpp
::
BlockDesc
*
block_desc
,
const
std
::
vector
<
std
::
string
>
&
input_names
,
const
std
::
vector
<
std
::
string
>
&
output_names
,
Scope
*
scope
)
:
subgraph
::
Engine
(
block_idx
,
block_desc
,
input_names
,
output_names
,
scope
)
{}
ctx
,
block_idx
,
block_desc
,
input_names
,
output_names
,
scope
)
{}
protected:
int
BuildDeviceProgram
()
override
;
...
...
lite/kernels/x86/gru_compute.cc
浏览文件 @
6554854a
...
...
@@ -13,10 +13,13 @@
// limitations under the License.
#include "lite/kernels/x86/gru_compute.h"
#include "lite/utils/env.h"
DEFINE_int32
(
paddle_num_threads
,
1
,
"Number of threads for each paddle instance."
);
// DEFINE_int32(paddle_num_threads,
// 1,
// "Number of threads for each paddle instance.");
int32_t
paddle_num_threads
=
paddle
::
lite
::
GetIntFromEnv
(
"paddle_num_threads"
,
1
);
REGISTER_LITE_KERNEL
(
gru
,
kX86
,
...
...
lite/kernels/x86/gru_compute.h
浏览文件 @
6554854a
...
...
@@ -26,7 +26,8 @@
#include "lite/core/types.h"
#include "lite/fluid/eigen.h"
DECLARE_int32
(
paddle_num_threads
);
// DECLARE_int32(paddle_num_threads);
extern
int32_t
paddle_num_threads
;
namespace
paddle
{
namespace
lite
{
...
...
@@ -109,7 +110,7 @@ class GRUCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
#ifdef PADDLE_WITH_MKLML
// use MKL packed to speedup GEMM
if
(
FLAGS_
paddle_num_threads
>=
4
)
{
if
(
paddle_num_threads
>=
4
)
{
auto
blas
=
lite
::
x86
::
math
::
GetBlas
<
TARGET
(
kX86
),
T
>
(
context
);
T
*
packed_gate
=
blas
.
GEMM_ALLOC
(
CblasBMatrix
,
1
/*height of C*/
,
...
...
lite/kernels/xpu/bridges/matmul_op.cc
浏览文件 @
6554854a
...
...
@@ -49,9 +49,10 @@ int MatmulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto
out_type
=
kernel
->
GetOutputDeclType
(
"Out"
);
CHECK
(
out_type
->
precision
()
==
PRECISION
(
kFloat
));
CHECK
(
out_type
->
layout
()
==
DATALAYOUT
(
kNCHW
));
auto
out
=
scope
->
FindMutableTensor
(
out_name
);
auto
out_dims
=
out
->
dims
();
auto
transpose_x
=
op_info
->
GetAttr
<
bool
>
(
"transpose_X"
);
CHECK
(
!
transpose_x
)
<<
"XPU only support transpose_x == true now"
;
auto
transpose_y
=
op_info
->
GetAttr
<
bool
>
(
"transpose_Y"
);
auto
alpha
=
op_info
->
GetAttr
<
float
>
(
"alpha"
);
...
...
@@ -71,11 +72,68 @@ int MatmulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
y_node
=
graph
->
AddNode
(
y_name
,
y_dims
);
}
auto
matmul_node
=
graph
->
builder_
.
CreateMatmul2D
(
*
x_node
,
*
y_node
,
transpose_y
);
graph
->
AddNode
(
out_name
,
graph
->
builder_
.
CreateScale
(
matmul_node
,
alpha
));
return
SUCCESS
;
// Matmul node
if
(
x_dims
.
size
()
>
2
&&
y_dims
.
size
()
>=
2
)
{
// x: [B, ..., M, K], y: [B, ..., K, N], out: [B, ..., M, N]
// x: [B, M, K], y: [K, N], out: [B, M, N]
// Reshape and transposed X node
if
(
x_dims
.
size
()
!=
3
)
{
auto
m
=
static_cast
<
int
>
(
x_dims
[
x_dims
.
size
()
-
2
]);
auto
k
=
static_cast
<
int
>
(
x_dims
[
x_dims
.
size
()
-
1
]);
x_node
=
graph
->
AddNode
(
x_name
+
"/reshape"
,
graph
->
builder_
.
CreateReshape
(
*
x_node
,
{
-
1
,
m
,
k
}));
if
(
transpose_x
)
{
x_node
=
graph
->
AddNode
(
x_name
+
"/reshape/transpose"
,
graph
->
builder_
.
CreateTranspose
(
*
x_node
,
{
0
,
2
,
1
}));
}
}
// Reshape and transposed Y node
if
(
y_dims
.
size
()
!=
3
)
{
auto
k
=
static_cast
<
int
>
(
y_dims
[
y_dims
.
size
()
-
2
]);
auto
n
=
static_cast
<
int
>
(
y_dims
[
y_dims
.
size
()
-
1
]);
y_node
=
graph
->
AddNode
(
y_name
+
"/reshape"
,
graph
->
builder_
.
CreateReshape
(
*
y_node
,
{
-
1
,
k
,
n
}));
if
(
!
transpose_y
)
{
y_node
=
graph
->
AddNode
(
y_name
+
"/reshape/transpose"
,
graph
->
builder_
.
CreateTranspose
(
*
y_node
,
{
0
,
2
,
1
}));
}
}
// Matmul node
auto
matmul_node
=
graph
->
AddNode
(
out_name
,
graph
->
builder_
.
CreateBatchMatmul
(
*
x_node
,
*
y_node
));
if
(
fabs
(
alpha
-
1
)
>
1e-6
f
)
{
matmul_node
=
graph
->
AddNode
(
out_name
,
graph
->
builder_
.
CreateScale
(
*
matmul_node
,
alpha
));
}
if
(
out_dims
.
size
()
!=
3
)
{
graph
->
AddNode
(
out_name
,
graph
->
builder_
.
CreateReshape
(
*
matmul_node
,
CvtShape
<
xtcl
::
Integer
>
(
out_dims
)));
}
}
else
if
(
x_dims
.
size
()
==
2
&&
y_dims
.
size
()
==
2
)
{
// x: [M, K], y: [K, N], out: [M, N]
if
(
transpose_x
)
{
x_node
=
graph
->
AddNode
(
x_name
+
"/transpose"
,
graph
->
builder_
.
CreateTranspose
(
*
x_node
,
{
1
,
0
}));
}
auto
matmul_node
=
graph
->
AddNode
(
out_name
,
graph
->
builder_
.
CreateMatmul2D
(
*
x_node
,
*
y_node
,
transpose_y
));
if
(
fabs
(
alpha
-
1
)
>
1e-6
f
)
{
matmul_node
=
graph
->
AddNode
(
out_name
,
graph
->
builder_
.
CreateScale
(
*
matmul_node
,
alpha
));
}
}
else
if
(
x_dims
.
size
()
==
1
&&
y_dims
.
size
()
==
1
)
{
// x: [K], y: [K], out: [1]
// x: [M], y: [N], x_transpose: true, y_transpose: true, out: [M, N]
LOG
(
FATAL
)
<<
"[XPU] Not supported."
;
return
FAILED
;
}
return
REBUILD_WHEN_SHAPE_CHANGED
;
}
}
// namespace xpu
...
...
lite/kernels/xpu/bridges/mul_op.cc
浏览文件 @
6554854a
...
...
@@ -67,15 +67,27 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
x_node
=
graph
->
AddNode
(
x_name
+
"/reshape"
,
graph
->
builder_
.
CreateReshape
(
*
x_node
,
{
-
1
,
static_cast
<
int
>
(
y_matrix_dims
[
0
])}));
*
x_node
,
{
-
1
,
static_cast
<
int
>
(
x_matrix_dims
[
1
])}));
}
// Y node
auto
y_const_node
=
graph
->
AddNode
(
y_name
,
*
y
,
y_matrix_dims
);
std
::
shared_ptr
<
xtcl
::
xExpr
>
y_node
=
nullptr
;
if
(
graph
->
HasNode
(
y_name
))
{
y_node
=
graph
->
GetNode
(
y_name
);
}
else
{
y_node
=
graph
->
AddNode
(
y_name
,
y_dims
);
}
// Flatten Y node
if
(
y_dims
.
size
()
!=
2
)
{
y_node
=
graph
->
AddNode
(
y_name
+
"/reshape"
,
graph
->
builder_
.
CreateReshape
(
*
y_node
,
{
static_cast
<
int
>
(
y_matrix_dims
[
0
]),
-
1
}));
}
// Reshape the matmul node with the inferred shape as the output node
auto
matmul_node
=
graph
->
AddNode
(
out_name
,
graph
->
builder_
.
CreateMatmul2D
(
*
x_node
,
*
y_
const_
node
,
false
));
out_name
,
graph
->
builder_
.
CreateMatmul2D
(
*
x_node
,
*
y_node
,
false
));
if
(
out_dims
.
size
()
!=
2
)
{
graph
->
AddNode
(
out_name
,
graph
->
builder_
.
CreateReshape
(
...
...
lite/kernels/xpu/subgraph_compute.cc
浏览文件 @
6554854a
...
...
@@ -197,7 +197,8 @@ int SubgraphEngine::LaunchDeviceProgram() {
void
SubgraphCompute
::
PrepareForRun
()
{
auto
&
param
=
this
->
Param
<
param_t
>
();
engine_
.
reset
(
new
SubgraphEngine
(
param
.
sub_block_idx
,
engine_
.
reset
(
new
SubgraphEngine
(
ctx_
.
get
(),
param
.
sub_block_idx
,
param
.
sub_block_desc
,
param
.
input_data_names
,
param
.
output_data_names
,
...
...
lite/kernels/xpu/subgraph_compute.h
浏览文件 @
6554854a
...
...
@@ -29,13 +29,14 @@ namespace xpu {
class
SubgraphEngine
:
public
subgraph
::
Engine
{
public:
SubgraphEngine
(
int
block_idx
,
SubgraphEngine
(
KernelContext
*
ctx
,
int
block_idx
,
cpp
::
BlockDesc
*
block_desc
,
const
std
::
vector
<
std
::
string
>
&
input_names
,
const
std
::
vector
<
std
::
string
>
&
output_names
,
Scope
*
scope
)
:
subgraph
::
Engine
(
block_idx
,
block_desc
,
input_names
,
output_names
,
scope
)
{}
ctx
,
block_idx
,
block_desc
,
input_names
,
output_names
,
scope
)
{}
protected:
int
BuildDeviceProgram
()
override
;
...
...
lite/operators/CMakeLists.txt
浏览文件 @
6554854a
...
...
@@ -50,6 +50,7 @@ add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS})
add_operator
(
instance_norm_op basic SRCS instance_norm_op.cc DEPS
${
op_DEPS
}
)
add_operator
(
subgraph_op basic SRCS subgraph_op.cc DEPS
${
op_DEPS
}
)
add_operator
(
grid_sampler_op basic SRCS grid_sampler_op.cc DEPS
${
op_DEPS
}
)
add_operator
(
flatten_op basic SRCS flatten_op.cc DEPS
${
op_DEPS
}
)
# 2.basic ops not used in basic models
add_operator
(
negative_op extra SRCS negative_op.cc DEPS
${
op_DEPS
}
)
...
...
@@ -78,11 +79,9 @@ add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEP
add_operator
(
generate_proposals_op extra SRCS generate_proposals_op.cc DEPS
${
op_DEPS
}
)
add_operator
(
roi_align_op extra SRCS roi_align_op.cc DEPS
${
op_DEPS
}
)
add_operator
(
box_clip_op extra SRCS box_clip_op.cc DEPS
${
op_DEPS
}
)
add_operator
(
flatten_op extra SRCS flatten_op.cc DEPS
${
op_DEPS
}
)
add_operator
(
fake_quantize_range_abs_max_op extra SRCS fake_quantize_range_abs_max.cc DEPS
${
op_DEPS
}
)
add_operator
(
sequence_expand_as_op_lite extra SRCS sequence_expand_as_op.cc DEPS
${
op_DEPS
}
)
add_operator
(
assign_value_op extra SRCS assign_value_op.cc DEPS
${
op_DEPS
}
)
add_operator
(
fake_quantize_dequantize_moving_avg_abs_max_op extra SRCS fake_quantize_dequantize_moving_avg_max_abs.cc DEPS
${
op_DEPS
}
)
add_operator
(
fake_channel_wise_dequantize_max_abs_op extra SRCS fake_channel_wise_dequantize_max_abs.cc DEPS
${
op_DEPS
}
)
add_operator
(
split_lod_tensor_op_lite extra SRCS split_lod_tensor_op.cc DEPS
${
op_DEPS
}
)
...
...
lite/operators/attention_padding_mask_op.cc
浏览文件 @
6554854a
...
...
@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/operators/attention_padding_mask_op.h"
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/scope.h"
...
...
@@ -39,7 +40,8 @@ bool AttentionPaddingMaskOp::InferShape() const {
<<
"Mismatch batch size, bottom0: "
<<
att_batch
<<
", bottom1: "
<<
src_batch
;
param_
.
pad_begin
->
Resize
({
static_cast
<
int64_t
>
(
src_batch
)});
param_
.
pad_begin
->
Resize
(
std
::
vector
<
int64_t
>
({
static_cast
<
int64_t
>
(
src_batch
)}));
param_
.
Out
->
Resize
(
param_
.
X
->
dims
());
param_
.
Out
->
set_lod
(
param_
.
X
->
lod
());
...
...
lite/operators/instance_norm_op.cc
浏览文件 @
6554854a
...
...
@@ -46,8 +46,9 @@ bool InstanceNormOp::InferShape() const {
auto
x_dims
=
param_
.
x
->
dims
();
int64_t
batch_size
=
x_dims
[
0
];
int64_t
channel_size
=
x_dims
[
1
];
param_
.
saved_mean
->
Resize
({
batch_size
*
channel_size
});
param_
.
saved_variance
->
Resize
({
batch_size
*
channel_size
});
param_
.
saved_mean
->
Resize
(
std
::
vector
<
int64_t
>
({
batch_size
*
channel_size
}));
param_
.
saved_variance
->
Resize
(
std
::
vector
<
int64_t
>
({
batch_size
*
channel_size
}));
param_
.
out
->
Resize
(
x_dims
);
return
true
;
}
...
...
lite/operators/reduce_prod_op.cc
浏览文件 @
6554854a
...
...
@@ -50,7 +50,7 @@ bool ReduceProdOpLite::InferShape() const {
if
(
keep_dim
)
{
out
->
Resize
({
static_cast
<
int64_t
>
(
x_rank
),
1
});
}
else
{
out
->
Resize
(
{
1
}
);
out
->
Resize
(
std
::
vector
<
int64_t
>
({
1L
})
);
}
}
else
{
auto
dims_vector
=
x_dims
.
Vectorize
();
...
...
lite/tests/kernels/CMakeLists.txt
浏览文件 @
6554854a
...
...
@@ -30,6 +30,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH
lite_cc_test
(
test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
x86_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
)
lite_cc_test
(
test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
x86_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
)
lite_cc_test
(
test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
x86_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
)
lite_cc_test
(
test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
x86_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
)
if
(
LITE_BUILD_EXTRA
)
lite_cc_test
(
test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework
${
x86_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
)
...
...
lite/tests/kernels/mul_compute_test.cc
0 → 100644
浏览文件 @
6554854a
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <cmath>
#include <string>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
#include "lite/tests/utils/fill_data.h"
namespace
paddle
{
namespace
lite
{
// Arena test case for the "mul" operator.
//
// PrepareData() registers random X / Y inputs, PrepareOpDesc() describes the
// op under test, and RunBaseline() computes the expected output with a naive
// triple-loop matrix product over the 2-D flattened views of X and Y.
class MulComputeTester : public arena::TestCase {
 protected:
  // common attributes for this op.
  std::string type_ = "mul";
  std::string x_ = "x";
  std::string y_ = "y";
  std::string out_ = "out";
  DDim x_dims_{{1, 2}};
  DDim y_dims_{{2, 1}};
  int x_num_col_dims_{1};
  int y_num_col_dims_{1};

 public:
  MulComputeTester(const Place& place,
                   const std::string& alias,
                   DDim x_dims,
                   DDim y_dims,
                   int x_num_col_dims,
                   int y_num_col_dims)
      : TestCase(place, alias),
        x_dims_(x_dims),
        y_dims_(y_dims),
        x_num_col_dims_(x_num_col_dims),
        y_num_col_dims_(y_num_col_dims) {}

  // Reference implementation: out[M, N] = x[M, K] * y[K, N], where the
  // matrices are X flattened at x_num_col_dims_ and Y at y_num_col_dims_.
  void RunBaseline(Scope* scope) override {
    auto* lhs_tensor = scope->FindTensor(x_);
    auto* rhs_tensor = scope->FindTensor(y_);

    auto lhs_matrix = x_dims_.Flatten2D(x_num_col_dims_);
    auto rhs_matrix = y_dims_.Flatten2D(y_num_col_dims_);
    // The shared (inner) dimension must agree.
    CHECK_EQ(lhs_matrix[1], rhs_matrix[0]);

    auto* result_tensor = scope->NewTensor(out_);
    CHECK(result_tensor);

    // Output shape = leading axes of X followed by trailing axes of Y.
    std::vector<int64_t> result_shape;
    for (int axis = 0; axis < x_num_col_dims_; ++axis) {
      result_shape.push_back(x_dims_[axis]);
    }
    for (int axis = y_num_col_dims_; axis < y_dims_.size(); ++axis) {
      result_shape.push_back(y_dims_[axis]);
    }
    result_tensor->Resize(DDim(result_shape));

    auto lhs = lhs_tensor->data<float>();
    auto rhs = rhs_tensor->data<float>();
    auto* dst = result_tensor->mutable_data<float>();

    const int rows = lhs_matrix[0];
    const int inner = lhs_matrix[1];
    const int cols = rhs_matrix[1];
    for (int row = 0; row < rows; ++row) {
      for (int col = 0; col < cols; ++col) {
        // Accumulate directly in the output cell, in float.
        float& cell = dst[row * cols + col];
        cell = 0;
        for (int step = 0; step < inner; ++step) {
          cell += lhs[row * inner + step] * rhs[step * cols + col];
        }
      }
    }
  }

  // Describes the op under test: inputs X/Y, output Out and both flatten axes.
  void PrepareOpDesc(cpp::OpDesc* op_desc) {
    op_desc->SetType(type_);
    op_desc->SetInput("X", {x_});
    op_desc->SetInput("Y", {y_});
    op_desc->SetOutput("Out", {out_});
    op_desc->SetAttr("x_num_col_dims", x_num_col_dims_);
    op_desc->SetAttr("y_num_col_dims", y_num_col_dims_);
  }

  // Fills X, then Y, via fill_data_rand(-1.f, 1.f) and registers both as
  // common input tensors.
  void PrepareData() override {
    std::vector<float> x_values(x_dims_.production());
    fill_data_rand(x_values.data(), -1.f, 1.f, x_dims_.production());
    SetCommonTensor(x_, x_dims_, x_values.data());

    std::vector<float> y_values(y_dims_.production());
    fill_data_rand(y_values.data(), -1.f, 1.f, y_dims_.production());
    SetCommonTensor(y_, y_dims_, y_values.data());
  }
};
void
TestMul
(
const
std
::
vector
<
int64_t
>&
x_dims
,
const
std
::
vector
<
int64_t
>&
y_dims
,
int
x_num_col_dims
,
int
y_num_col_dims
,
const
Place
&
place
,
float
abs_error
)
{
std
::
unique_ptr
<
arena
::
TestCase
>
tester
(
new
MulComputeTester
(
place
,
"def"
,
DDim
(
x_dims
),
DDim
(
y_dims
),
x_num_col_dims
,
y_num_col_dims
));
arena
::
Arena
arena
(
std
::
move
(
tester
),
place
,
abs_error
);
arena
.
TestPrecision
();
}
// Precision test for the "mul" op. Only runs when built with LITE_WITH_XPU;
// on every other build the test returns immediately.
TEST(Mul, precision) {
  LOG(INFO) << "test mul op";
  float abs_error = 2e-5;
  Place place;
#if defined(LITE_WITH_XPU)
  place = TARGET(kXPU);
#else
  return;
#endif

  // One entry per TestMul invocation, executed in the listed order.
  struct MulCase {
    std::vector<int64_t> x_dims;
    std::vector<int64_t> y_dims;
    int x_num_col_dims;
    int y_num_col_dims;
  };
  const std::vector<MulCase> mul_cases = {
      {{4, 5}, {5, 4}, 1, 1},
      {{4, 5}, {5, 4, 3, 2}, 1, 1},
      {{4, 20}, {5, 4, 3, 2}, 1, 2},
      {{4, 60}, {5, 4, 3, 2}, 1, 3},
      {{2, 3, 4, 5}, {60, 4}, 1, 1},
      {{2, 3, 4, 5}, {20, 4}, 2, 1},
      {{2, 3, 4, 5}, {5, 4}, 3, 1},
      {{2, 3, 4, 5}, {60, 3, 4, 5}, 1, 1},
      {{2, 3, 4, 5}, {4, 5, 6, 2}, 2, 2},
      {{2, 3, 4, 5}, {5, 1, 4, 2}, 3, 2},
  };
  for (const auto& mul_case : mul_cases) {
    TestMul(mul_case.x_dims,
            mul_case.y_dims,
            mul_case.x_num_col_dims,
            mul_case.y_num_col_dims,
            place,
            abs_error);
  }
}
}
// namespace lite
}
// namespace paddle
lite/tests/kernels/unsqueeze_compute_test.cc
浏览文件 @
6554854a
...
...
@@ -107,6 +107,7 @@ class UnsqueezeComputeTester : public arena::TestCase {
}
void
PrepareData
()
override
{
SetPrecisionType
(
out_
,
PRECISION
(
kFloat
));
std
::
vector
<
float
>
in_data
(
dims_
.
production
());
for
(
int
i
=
0
;
i
<
dims_
.
production
();
++
i
)
{
in_data
[
i
]
=
i
;
...
...
@@ -213,6 +214,7 @@ class Unsqueeze2ComputeTester : public arena::TestCase {
}
void
PrepareData
()
override
{
SetPrecisionType
(
out_
,
PRECISION
(
kFloat
));
std
::
vector
<
float
>
in_data
(
dims_
.
production
());
for
(
int
i
=
0
;
i
<
dims_
.
production
();
++
i
)
{
in_data
[
i
]
=
i
;
...
...
lite/tools/ci_build.sh
浏览文件 @
6554854a
...
...
@@ -1042,23 +1042,6 @@ function main {
build_test_arm_subtask_armlinux
shift
;;
build_test_arm_model_mobilenetv1
)
build_test_arm_subtask_model test_mobilenetv1 mobilenet_v1
build_test_arm_subtask_model test_mobilenetv1_int8 MobileNetV1_quant
shift
;;
build_test_arm_model_mobilenetv2
)
build_test_arm_subtask_model test_mobilenetv2 mobilenet_v2_relu
shift
;;
build_test_arm_model_resnet50
)
build_test_arm_subtask_model test_resnet50 resnet50
shift
;;
build_test_arm_model_inceptionv4
)
build_test_arm_subtask_model test_inceptionv4 inception_v4_simple
shift
;;
check_style
)
check_style
shift
...
...
lite/utils/env.h
0 → 100644
浏览文件 @
6554854a
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <climits>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
namespace
paddle
{
namespace
lite
{
// Returns the value of the environment variable named `str` as a
// std::string, or `def` when the variable is not set. A variable that is set
// to the empty string yields an empty string, not `def`.
static std::string GetStringFromEnv(const std::string& str,
                                    const std::string& def = "") {
  const char* raw_value = std::getenv(str.c_str());
  return raw_value != nullptr ? std::string(raw_value) : def;
}
// Interprets the environment variable named `str` as a boolean flag.
//   - variable not set            -> `def`
//   - value is exactly "false"/"0" -> false
//   - anything else (including "") -> true
// The comparison is case-sensitive.
static bool GetBoolFromEnv(const std::string& str, bool def = false) {
  const char* raw_value = std::getenv(str.c_str());
  if (raw_value == nullptr) {
    return def;
  }
  const std::string value(raw_value);
  return value != "false" && value != "0";
}
// Parses the environment variable named `str` as a base-10 integer.
//
// Returns `def` when the variable is not set. Leading whitespace, an optional
// sign and trailing garbage are handled as atoi() would handle them (e.g.
// "12abc" -> 12, "abc" -> 0). Unlike atoi(), whose result is undefined for
// values that do not fit in an int, out-of-range input saturates to
// INT_MAX / INT_MIN.
static int GetIntFromEnv(const std::string& str, int def = 0) {
  const char* variable = std::getenv(str.c_str());
  if (!variable) {
    return def;
  }
  // strtol itself saturates at LONG_MAX / LONG_MIN; clamp further to int.
  const long parsed = std::strtol(variable, nullptr, 10);  // NOLINT
  if (parsed > INT_MAX) {
    return INT_MAX;
  }
  if (parsed < INT_MIN) {
    return INT_MIN;
  }
  return static_cast<int>(parsed);
}
// Parses the environment variable named `str` as a floating-point number.
// Returns `def` when the variable is not set; text that does not start with
// a number parses as 0.0.
static double GetDoubleFromEnv(const std::string& str, double def = 0.0) {
  const char* raw_value = std::getenv(str.c_str());
  if (raw_value == nullptr) {
    return def;
  }
  return std::strtod(raw_value, nullptr);
}
// Parses the environment variable named `str` as a base-10 unsigned 64-bit
// integer.
//
// Returns `def` when the variable is not set. The value is parsed with
// strtoull so the whole uint64_t range is representable; the previous
// atol-based parsing went through a signed `long` and could not return
// anything above LONG_MAX. Text that does not start with a number yields 0.
static uint64_t GetUInt64FromEnv(const std::string& str, uint64_t def = 0ul) {
  const char* variable = std::getenv(str.c_str());
  if (!variable) {
    return def;
  }
  return static_cast<uint64_t>(std::strtoull(variable, nullptr, 10));
}
}
// namespace lite
}
// namespace paddle
mobile/src/framework/cl/cl_image.cpp
浏览文件 @
6554854a
...
...
@@ -18,6 +18,37 @@ limitations under the License. */
namespace
paddle_mobile
{
namespace
framework
{
void
CLImage
::
PrintTensor
(
const
CLImage
&
cl_image
)
const
{
size_t
width
=
cl_image
.
ImageDims
()[
0
];
size_t
height
=
cl_image
.
ImageDims
()[
1
];
half_t
*
image_data
=
new
half_t
[
height
*
width
*
4
];
cl_int
err
;
cl_mem
image
=
cl_image
.
GetCLImage
();
size_t
origin
[
3
]
=
{
0
,
0
,
0
};
size_t
region
[
3
]
=
{
width
,
height
,
1
};
err
=
clEnqueueReadImage
(
cl_image
.
CommandQueue
(),
image
,
CL_TRUE
,
origin
,
region
,
0
,
0
,
image_data
,
0
,
NULL
,
NULL
);
CL_CHECK_ERRORS
(
err
);
PADDLE_MOBILE_ENFORCE
(
cl_image
.
numel
()
!=
0
,
"cl_image numel should not be 0 "
);
float
*
tensor_data
=
new
float
[
cl_image
.
numel
()];
auto
converter
=
cl_image
.
Converter
();
converter
->
ImageToNCHW
(
image_data
,
tensor_data
,
cl_image
.
ImageDims
(),
cl_image
.
dims
());
int
stride
=
cl_image
.
numel
()
/
20
;
stride
=
stride
>
0
?
stride
:
1
;
for
(
int
i
=
0
;
i
<
cl_image
.
numel
();
i
++
)
{
printf
(
"%f
\n
"
,
tensor_data
[
i
]);
}
delete
[](
tensor_data
);
delete
[](
image_data
);
}
void
CLImageToTensor
(
CLImage
*
cl_image
,
Tensor
*
tensor
,
cl_context
context
,
cl_command_queue
commandQueue
,
cl_kernel
kernel
)
{
tensor
->
mutable_data
<
float
>
();
...
...
mobile/src/framework/cl/cl_image.h
浏览文件 @
6554854a
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <iostream>
#include <memory>
#include <vector>
...
...
@@ -285,6 +286,7 @@ class CLImage {
cl_event
GetClEvent
()
const
{
return
cl_event_
.
get
();
}
CLImageConverterBase
*
Converter
()
const
{
return
image_converter_
;
}
void
PrintTensor
(
const
CLImage
&
cl_image
)
const
;
private:
void
InitCLImage
(
cl_context
context
,
size_t
width
,
size_t
height
,
...
...
mobile/src/framework/cl/cl_tool.h
浏览文件 @
6554854a
...
...
@@ -21,13 +21,14 @@ namespace framework {
const
char
*
opencl_error_to_str
(
cl_int
error
);
#define CL_CHECK_ERRORS(ERR) \
if (ERR != CL_SUCCESS) { \
printf( \
"OpenCL error with code %s happened in file %s at line %d. " \
"Exiting.\n", \
paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \
__LINE__); \
#define CL_CHECK_ERRORS(ERR) \
if (ERR != CL_SUCCESS) { \
printf( \
"\033[1;31;40mOpenCL error with code %s happened in file %s at line " \
"%d. " \
"Exiting.\033[0m\n", \
paddle_mobile::framework::opencl_error_to_str(ERR), __FILE__, \
__LINE__); \
}
}
// namespace framework
...
...
mobile/src/framework/executor.cpp
浏览文件 @
6554854a
...
...
@@ -363,7 +363,10 @@ void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
DLOG
<<
"InitNoPersistableMemory var "
<<
var_desc
->
Name
();
auto
tensor
=
var
->
template
GetMutable
<
LoDTensor
>();
if
(
tensor
->
IsInitialized
()
&&
tensor
->
dims
().
size
()
==
4
)
{
DLOG
<<
"var's tensor is Initialized or dims size != 4"
;
// don't change user's input and avoid memory leaks
if
(
feed_indices_
.
find
(
var_desc
->
Name
())
!=
feed_indices_
.
end
())
{
break
;
}
DDim
tensor_dim
=
tensor
->
dims
();
DDim
new_dim
=
make_ddim
({
tensor_dim
[
0
],
tensor_dim
[
1
],
input_tensor
.
dims
()[
2
],
...
...
mobile/src/operators/kernel/cl/cl-kernel-func/conv_func.cpp
浏览文件 @
6554854a
...
...
@@ -241,7 +241,9 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
cl_int
status
;
int
index
=
0
;
if
(
param
.
Filter
()
->
dims
()[
2
]
==
1
&&
param
.
Filter
()
->
dims
()[
3
]
==
1
)
{
const
int
filter_height
=
param
.
Filter
()
->
dims
()[
2
];
const
int
filter_width
=
param
.
Filter
()
->
dims
()[
3
];
if
(
filter_height
==
1
&&
filter_width
==
1
)
{
status
=
clSetKernelArg
(
kernel
,
index
++
,
sizeof
(
int
),
&
c_block
);
CL_CHECK_ERRORS
(
status
);
...
...
@@ -404,7 +406,7 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
status
=
clSetKernelArg
(
kernel
,
index
++
,
sizeof
(
int
),
&
output_height
);
CL_CHECK_ERRORS
(
status
);
if
(
param
.
Filter
()
->
dims
()[
2
]
==
3
&&
param
.
Filter
()
->
dims
()[
3
]
==
3
)
{
if
(
filter_height
==
3
&&
filter_width
==
3
)
{
// normal conv
if
(
param
.
Filter
()
->
dims
()[
0
]
==
param
.
Output
()
->
dims
()[
1
]
&&
param
.
Filter
()
->
dims
()[
1
]
==
param
.
Input
()
->
dims
()[
1
])
{
...
...
@@ -425,6 +427,17 @@ void ConvAddBnRelu(framework::CLHelper *cl_helper,
status
=
clSetKernelArg
(
kernel
,
index
++
,
sizeof
(
int
),
&
group
);
CL_CHECK_ERRORS
(
status
);
}
}
else
if
(
filter_height
!=
3
&&
filter_width
!=
3
)
{
// not 3x3
if
(
param
.
Filter
()
->
dims
()[
1
]
==
1
&&
param
.
Input
()
->
dims
()[
1
]
==
param
.
Output
()
->
dims
()[
1
])
{
// deepwise basic use in not 3x3
status
=
clSetKernelArg
(
kernel
,
index
++
,
sizeof
(
int
),
&
filter_width
);
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
index
++
,
sizeof
(
int
),
&
filter_height
);
CL_CHECK_ERRORS
(
status
);
}
}
status
=
clEnqueueNDRangeKernel
(
...
...
mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl
100755 → 100644
浏览文件 @
6554854a
...
...
@@ -24,980 +24,1101 @@ conv_add_bn_relu
#
include
"cl_common.h"
__kernel
void
conv_3x3
(
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1,
__private
const
int
global_size_dim2,
__read_only
image2d_t
input_image,
__read_only
image2d_t
filter,
__kernel
void
conv_3x3
(
__private
const
int
global_size_dim0,
__private
const
int
global_size_dim1,
__private
const
int
global_size_dim2,
__read_only
image2d_t
input_image,
__read_only
image2d_t
filter,
#
if
defined
(
BIASE_CH
)
|
| defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height,
__private const int output_c,
__private const int filter_channel,
__private const int group) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height,
__private const int output_c, __private const int filter_channel,
__private const int group) {
int2 stride_xy
;
stride_xy.x = stride
;
stride_xy.y = stride
;
const int out_c = get_global_id(0)
;
const int out_w = get_global_id(1)
;
const int out_nh = get_global_id(2)
;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST
;
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride
;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
| CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos);
half4 output = read_imageh(bias, sampler, output_pos);
#else
half4 output = 0.0f;
#endif
half4 input[9];
if (group == 1) {
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
input[0] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[1] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[2] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[3] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[4] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[5] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[6] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[7] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
/*
for (int j = 0; j < 9; ++j) {
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
*/
int j = 0;
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 1;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 2;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 3;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 4;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 5;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 6;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 7;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 8;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
half4 output = 0.0f;
#endif
half4 input[9];
if (group == 1) {
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y);
input[0] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[1] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[2] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[3] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[4] = select(
read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[5] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[6] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[7] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[8] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
/*
for (int j = 0; j < 9; ++j) {
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler,
pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler,
pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler,
pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler,
pos_of_weight);
output.w += dot(input[j], weight_w);
}
*/
int j = 0;
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 1;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 2;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 3;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 4;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 5;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 6;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 7;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 8;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
} else {
for (int i = 0; i < 4; i++) {
int used_input_channel_num =
(out_c * 4 + i) / (output_c / group) * filter_channel;
for (int f_c = 0; f_c < filter_channel; ++f_c) {
int input_c = used_input_channel_num + f_c;
int input_block = input_c / 4;
int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x,
in_pos_in_one_block.y);
input[0] = select(
read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[1] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[2] = select(
read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[3] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[4] = select(
read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[5] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[6] = select(
read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[7] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[8] = select(
read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
half tmp_out = 0;
for (int j = 0; j < 9; j++) {
int2 pos_of_weight;
pos_of_weight.x = (f_c / 4) * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3;
half4 weight = read_imageh(filter, sampler, pos_of_weight);
int f_c_offset = f_c % 4;
half f_value;
if (f_c_offset == 0) {
f_value = weight.x;
} else if (f_c_offset == 1) {
f_value = weight.y;
} else if (f_c_offset == 2) {
f_value = weight.z;
} else if (f_c_offset == 3) {
f_value = weight.w;
}
int input_c_offset = input_c % 4;
half input_value;
if (input_c_offset == 0) {
input_value = input[j].x;
} else if (input_c_offset == 1) {
input_value = input[j].y;
} else if (input_c_offset == 2) {
input_value = input[j].z;
} else if (input_c_offset == 3) {
input_value = input[j].w;
}
tmp_out += f_value * input_value;
}
} else {
for (int i = 0; i < 4; i++) {
int used_input_channel_num = (out_c * 4 + i) / (output_c / group) * filter_channel;
for (int f_c = 0; f_c < filter_channel; ++f_c) {
int input_c = used_input_channel_num + f_c;
int input_block = input_c / 4;
int2 pos_in = (int2)(input_block * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
input[0] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[1] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[2] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[3] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[4] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[5] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[6] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[7] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
half tmp_out = 0;
for (int j = 0; j < 9; j++) {
int2 pos_of_weight;
pos_of_weight.x = (f_c / 4) * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + i * 3 + j / 3;
half4 weight = read_imageh(filter, sampler, pos_of_weight);
int f_c_offset = f_c % 4;
half f_value;
if (f_c_offset == 0) {
f_value = weight.x;
} else if (f_c_offset == 1) {
f_value = weight.y;
} else if (f_c_offset == 2) {
f_value = weight.z;
} else if (f_c_offset == 3) {
f_value = weight.w;
}
int input_c_offset = input_c % 4;
half input_value;
if (input_c_offset == 0) {
input_value = input[j].x;
} else if (input_c_offset == 1) {
input_value = input[j].y;
} else if (input_c_offset == 2) {
input_value = input[j].z;
} else if (input_c_offset == 3) {
input_value = input[j].w;
}
tmp_out += f_value * input_value;
}
if (i == 0) {
output.x += tmp_out;
} else if (i == 1) {
output.y += tmp_out;
} else if (i == 2) {
output.z += tmp_out;
} else if (i == 3) {
output.w += tmp_out;
}
}
if (i == 0) {
output.x += tmp_out;
} else if (i == 1) {
output.y += tmp_out;
} else if (i == 2) {
output.z += tmp_out;
} else if (i == 3) {
output.w += tmp_out;
}
}
}
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output = activation(output);
output = activation(output);
#endif
write_imageh(output_image, output_pos, output);
write_imageh(output_image, output_pos, output);
}
// conv_3x3spl: 3x3 convolution over half-precision image2d data.
//
// Each work-item is identified by (get_global_id(0), get_global_id(1),
// get_global_id(2)) = (output channel block, column, row) and accumulates up
// to five output pixels on the same row, spaced `item_w` columns apart.
// Optional stages are selected at compile time:
//   BIASE_CH / BIASE_ELE - initialise the accumulators from `bias`
//                          (one texel per channel block, or one per output).
//   BATCH_NORM           - output = scale * output + biase.
//   RELU                 - apply `activation` to every accumulator.
// The `dilation` argument is not referenced in the body (see the
// "dilation == 1" remark below).
//
// NOTE(review): this span looks like a flattened diff view - the old and the
// re-formatted version of each statement both appear, so many declarations
// are duplicated. Confirm which copy is authoritative before compiling.
// dilation == 1
__kernel void conv_3x3spl(__private const int item_ch,
__private const int item_w,
__private const int item_h,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
// dilation == 1
__kernel void conv_3x3spl(
__private const int item_ch, __private const int item_w,
__private const int item_h, __read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int in_ch,
__private const int in_w,
__private const int in_h,
__private const int out_w,
__private const int out_h) {
// NOTE(review): every read_imageh below passes integer (int2) coordinates
// while the sampler requests normalized coordinates - presumably this should
// be CLK_NORMALIZED_COORDS_FALSE; confirm against the OpenCL C image rules.
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST;
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// out_width_id_per_blk and out_batch_id
// NOTE(review): the batch index divides by in_h while the row index below
// uses out_h - confirm this is intended when in_h != out_h.
int out_batch_id = item_h_id / in_h;
int out_w_base_id = item_ch_id * out_w;
// Five output columns handled by this work-item, item_w apart.
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;
// in_width_id_per_blk and in_height_id_per_batch
// Top-left input coordinate of each 3x3 window (may be negative due to pad).
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int pad, __private const int dilation,
__private const int in_ch, __private const int in_w,
__private const int in_h, __private const int out_w,
__private const int out_h) {
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
| CLK_FILTER_NEAREST;
// item_id
const int item_ch_id = get_global_id(0);
const int item_w_id = get_global_id(1);
const int item_h_id = get_global_id(2);
// out_width_id_per_blk and out_batch_id
int out_batch_id = item_h_id / in_h;
int out_w_base_id = item_ch_id * out_w;
int out_w_id0 = item_w_id;
int out_w_id1 = out_w_id0 + item_w;
int out_w_id2 = out_w_id1 + item_w;
int out_w_id3 = out_w_id2 + item_w;
int out_w_id4 = out_w_id3 + item_w;
// in_width_id_per_blk and in_height_id_per_batch
int in_h_id = (item_h_id % out_h) * stride - pad;
int in_w_id0 = item_w_id * stride - pad;
int in_w_id1 = in_w_id0 + item_w * stride;
int in_w_id2 = in_w_id1 + item_w * stride;
int in_w_id3 = in_w_id2 + item_w * stride;
int in_w_id4 = in_w_id3 + item_w * stride;
// Accumulator initialisation: per-channel bias, per-element bias, or zero.
#ifdef BIASE_CH
half4 output[5];
output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0];
output[2] = output[0];
output[3] = output[0];
output[4] = output[0];
half4 output[5];
output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
output[1] = output[0];
output[2] = output[0];
output[3] = output[0];
output[4] = output[0];
#elif defined(BIASE_ELE)
// Columns at or beyond out_w are not loaded here; they are also never
// written back at the end of the kernel.
half4 output[5];
output[0] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) {
output[1] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id1, item_h_id));
}
if (out_w_id2 < out_w) {
output[2] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id2, item_h_id));
}
if (out_w_id3 < out_w) {
output[3] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id4, item_h_id));
}
half4 output[5];
output[0] =
read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
if (out_w_id1 < out_w) {
output[1] = read_imageh(bias, sampler,
(int2)(out_w_base_id + out_w_id1, item_h_id));
}
if (out_w_id2 < out_w) {
output[2] = read_imageh(bias, sampler,
(int2)(out_w_base_id + out_w_id2, item_h_id));
}
if (out_w_id3 < out_w) {
output[3] = read_imageh(bias, sampler,
(int2)(out_w_base_id + out_w_id3, item_h_id));
}
if (out_w_id4 < out_w) {
output[4] = read_imageh(bias, sampler,
(int2)(out_w_base_id + out_w_id4, item_h_id));
}
#else
half4 output[5] = {0.0f};
#endif
half4 filter[4] = {0.0f};
half4 filter_trans[4] = {0.0f};
half4 input[5] = {0.0f};
// Filter rows for the four output channels of this block: 3 rows each.
int filter_h_val0 = item_ch_id * 4 * 3;
int filter_h_val1 = filter_h_val0 + 3;
int filter_h_val2 = filter_h_val1 + 3;
int filter_h_val3 = filter_h_val2 + 3;
// Walk the input channels four at a time, then the 3x3 window.
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
// Number of lanes in this block of four that lie beyond in_ch (0..3);
// those lanes are skipped by the ch_surplus tests below.
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
const int in_w_base_id = mul24(ch, in_w);
int filter_w_val = ch * 3;
for (int h = 0; h < 3; h++) {
// Out-of-range rows/columns are redirected to coordinate -1, presumably so
// the clamped sampler returns the border colour - TODO confirm.
int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
(out_batch_id * in_h + in_h_id + h < 0 || out_batch_id * in_h + in_h_id + h >= in_h));
for (int w = 0; w < 3; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] = read_imageh(filter_image, sampler,(int2)(filter_w_val + w,filter_h_val3 + h)); // in_ch:0-3,out_ch:3
// Transpose the 4x4 weights so each half4 holds one input channel's weights
// for the four output channels.
filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x, filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y, filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z, filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w, filter[3].w); // in_ch:3,out_ch:0-3
input[0] = read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] = read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] = read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] = read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] = read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[2] = mad(input[2].x, filter_trans[0], output[2]);
output[3] = mad(input[3].x, filter_trans[0], output[3]);
output[4] = mad(input[4].x, filter_trans[0], output[4]);
if (ch_surplus < 3) {
output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[2] = mad(input[2].y, filter_trans[1], output[2]);
output[3] = mad(input[3].y, filter_trans[1], output[3]);
output[4] = mad(input[4].y, filter_trans[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
}
}
half4 output[5] = {0.0f};
#endif
half4 filter[4] = {0.0f};
half4 filter_trans[4] = {0.0f};
half4 input[5] = {0.0f};
int filter_h_val0 = item_ch_id * 4 * 3;
int filter_h_val1 = filter_h_val0 + 3;
int filter_h_val2 = filter_h_val1 + 3;
int filter_h_val3 = filter_h_val2 + 3;
for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;
const int in_w_base_id = mul24(ch, in_w);
int filter_w_val = ch * 3;
for (int h = 0; h < 3; h++) {
int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
(out_batch_id * in_h + in_h_id + h < 0 ||
out_batch_id * in_h + in_h_id + h >= in_h));
for (int w = 0; w < 3; w++) {
int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
(in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
(in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
(in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
(in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
(in_w_id4 + w < 0 || in_w_id4 + w >= in_w));
filter[0] = read_imageh(
filter_image, sampler,
(int2)(filter_w_val + w, filter_h_val0 + h)); // in_ch:0-3,out_ch:0
filter[1] = read_imageh(
filter_image, sampler,
(int2)(filter_w_val + w, filter_h_val1 + h)); // in_ch:0-3,out_ch:1
filter[2] = read_imageh(
filter_image, sampler,
(int2)(filter_w_val + w, filter_h_val2 + h)); // in_ch:0-3,out_ch:2
filter[3] = read_imageh(
filter_image, sampler,
(int2)(filter_w_val + w, filter_h_val3 + h)); // in_ch:0-3,out_ch:3
filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x,
filter[3].x); // in_ch:0,out_ch:0-3
filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y,
filter[3].y); // in_ch:1,out_ch:0-3
filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z,
filter[3].z); // in_ch:2,out_ch:0-3
filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w,
filter[3].w); // in_ch:3,out_ch:0-3
input[0] =
read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val));
input[1] =
read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
input[2] =
read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val));
input[3] =
read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val));
input[4] =
read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val));
output[0] = mad(input[0].x, filter_trans[0], output[0]);
output[1] = mad(input[1].x, filter_trans[0], output[1]);
output[2] = mad(input[2].x, filter_trans[0], output[2]);
output[3] = mad(input[3].x, filter_trans[0], output[3]);
output[4] = mad(input[4].x, filter_trans[0], output[4]);
if (ch_surplus < 3) {
output[0] = mad(input[0].y, filter_trans[1], output[0]);
output[1] = mad(input[1].y, filter_trans[1], output[1]);
output[2] = mad(input[2].y, filter_trans[1], output[2]);
output[3] = mad(input[3].y, filter_trans[1], output[3]);
output[4] = mad(input[4].y, filter_trans[1], output[4]);
}
if (ch_surplus < 2) {
output[0] = mad(input[0].z, filter_trans[2], output[0]);
output[1] = mad(input[1].z, filter_trans[2], output[1]);
output[2] = mad(input[2].z, filter_trans[2], output[2]);
output[3] = mad(input[3].z, filter_trans[2], output[3]);
output[4] = mad(input[4].z, filter_trans[2], output[4]);
}
if (ch_surplus < 1) {
output[0] = mad(input[0].w, filter_trans[3], output[0]);
output[1] = mad(input[1].w, filter_trans[3], output[1]);
output[2] = mad(input[2].w, filter_trans[3], output[2]);
output[3] = mad(input[3].w, filter_trans[3], output[3]);
output[4] = mad(input[4].w, filter_trans[3], output[4]);
}
}
}
}
// Batch-norm: output = scale * output + biase, one scale/bias texel per
// output channel block.
#ifdef BATCH_NORM
half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
output[0] = mad(scale, output[0], biase);
if (out_w_id1 < out_w) {
output[1] =
mad(scale, output[1], biase);
}
if (out_w_id2 < out_w) {
output[2] =
mad(scale, output[2], biase);
}
if (out_w_id3 < out_w) {
output[3] =
mad(scale, output[3], biase);
}
if (out_w_id4 < out_w) {
output[4] =
mad(scale, output[4], biase);
}
half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
output[0] = mad(scale, output[0], biase);
if (out_w_id1 < out_w) {
output[1] =
mad(scale, output[1], biase);
}
if (out_w_id2 < out_w) {
output[2] =
mad(scale, output[2], biase);
}
if (out_w_id3 < out_w) {
output[3] =
mad(scale, output[3], biase);
}
if (out_w_id4 < out_w) {
output[4] =
mad(scale, output[4], biase);
}
#endif
#ifdef RELU
output[0] = activation(output[0]);
output[1] = activation(output[1]);
output[2] = activation(output[2]);
output[3] = activation(output[3]);
output[4] = activation(output[4]);
#endif
// Write back; column 0 is written unconditionally, columns 1-4 only when
// they fall inside the output width.
write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id), output[0]);
if (out_w_id1 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id), output[1]);
}
if (out_w_id2 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id), output[2]);
}
if (out_w_id3 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id), output[3]);
}
if (out_w_id4 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]);
}
output[0] = activation(output[0]);
output[1] = activation(output[1]);
output[2] = activation(output[2]);
output[3] = activation(output[3]);
output[4] = activation(output[4]);
#endif
write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id),
output[0]);
if (out_w_id1 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id),
output[1]);
}
if (out_w_id2 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id),
output[2]);
}
if (out_w_id3 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id),
output[3]);
}
if (out_w_id4 < out_w) {
write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id),
output[4]);
}
}
// depth_conv_3x3: 3x3 depthwise convolution over half-precision image2d data.
//
// Each work-item (get_global_id(0..2) = channel block, column, batch*row)
// produces one half4 output texel: it gathers the nine input texels of the
// 3x3 window around the mapped input position, multiplies them element-wise
// with nine filter texels and sums the products on top of the optional bias.
// Compile-time options: BIASE_CH / BIASE_ELE (bias source), BATCH_NORM
// (output * new_scale + new_biase) and RELU (`activation`).
//
// NOTE(review): this span looks like a flattened diff view - the old and the
// re-formatted version of each statement both appear, so many declarations
// are duplicated. Confirm which copy is authoritative before compiling.
__kernel void depth_conv_3x3(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input,
__read_only image2d_t filter,
__kernel void depth_conv_3x3(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
// Output texel: channel blocks are laid out side by side along x, each
// global_size_dim1 texels wide.
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
// NOTE(review): every read_imageh below passes integer (int2) coordinates
// while the sampler requests normalized coordinates - presumably this should
// be CLK_NORMALIZED_COORDS_FALSE; confirm against the OpenCL C image rules.
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST;
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const int batch_index = out_nh / output_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
| CLK_FILTER_NEAREST;
const int out_nh_in_one_batch = out_nh %
output_height;
const int batch_index = out_nh /
output_height;
const int out_nh_in_one_batch = out_nh % output_height;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
// Centre of the 3x3 window inside one input block.
int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
// Accumulator initialisation: per-channel bias, per-element bias, or zero.
#ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos);
half4 output = read_imageh(bias, sampler, output_pos);
#else
half4 output = 0.0f;
half4 output = 0.0f;
#endif
const int filter_width = 3;
const int filter_height = 3;
// Origin of this channel block / batch inside the input image.
int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height);
// NOTE(review): the filter row origin is also derived from batch_index -
// confirm the filter image really has one 3-row band per batch.
int2 pos_in_filter_block = (int2)(out_c * filter_width, batch_index * filter_height);
int filter_x = pos_in_filter_block.x ;
int filter_y = pos_in_filter_block.y ;
// Gather the 3x3 neighbourhood (row-major, index 4 is the centre). The mask
// argument is the out-of-range test shifted into bit 15 of a ushort, so
// select() replaces out-of-range taps with zero.
half4 inputs[9];
inputs[0] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
inputs[1] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
inputs[2] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
inputs[3] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15));
/*
if (output_pos.x == 112 && output_pos.y == 0) {
half4 input1 = inputs[3];
float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
printf(" input4 3 - %v4hlf \n", in);
printf(" --- %d ---\n", in_pos_in_one_block.x - 1);
}
*/
inputs[4] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
inputs[5] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15));
inputs[6] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
inputs[7] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
inputs[8] = select(read_imageh(input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
// Matching 3x3 filter taps, same row-major order as inputs[].
half4 filters[9];
filters[0] = read_imageh(filter, sampler,(int2)(filter_x,filter_y));
filters[1] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y));
filters[2] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y));
filters[3] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 1));
filters[4] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 1));
filters[5] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 1));
filters[6] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 2));
filters[7] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 2));
filters[8] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 2));
// Element-wise multiply-accumulate: each half4 lane is an independent channel.
for(int i = 0 ;i < 9 ; i++){
output += inputs[i] * filters[i];
}
const int filter_width = 3;
const int filter_height = 3;
int2 pos_in_input_block =
(int2)(out_c * input_width, batch_index * input_height);
int2 pos_in_filter_block =
(int2)(out_c * filter_width, batch_index * filter_height);
int filter_x = pos_in_filter_block.x;
int filter_y = pos_in_filter_block.y;
half4 inputs[9];
inputs[0] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f), (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
in_pos_in_one_block.y - 1 < 0 ||
in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
<< 15));
inputs[1] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
<< 15));
inputs[2] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y - 1)),
(half4)(0.0f), (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
in_pos_in_one_block.y - 1 < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y - 1 >= input_height)
<< 15));
inputs[3] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
/*
if (output_pos.x == 112 && output_pos.y == 0) {
half4 input1 = inputs[3];
float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
printf(" input4 3 - %v4hlf \n", in);
printf(" --- %d ---\n", in_pos_in_one_block.x - 1);
}
*/
inputs[4] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
inputs[5] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
inputs[6] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f), (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x - 1 >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
<< 15));
inputs[7] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
<< 15));
inputs[8] = select(
read_imageh(input, sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
pos_in_input_block.y + in_pos_in_one_block.y + 1)),
(half4)(0.0f), (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
in_pos_in_one_block.y + 1 < 0 ||
in_pos_in_one_block.x + 1 >= input_width ||
in_pos_in_one_block.y + 1 >= input_height)
<< 15));
half4 filters[9];
filters[0] = read_imageh(filter, sampler, (int2)(filter_x, filter_y));
filters[1] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y));
filters[2] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y));
filters[3] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 1));
filters[4] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 1));
filters[5] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 1));
filters[6] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 2));
filters[7] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 2));
filters[8] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 2));
for (int i = 0; i < 9; i++) {
output += inputs[i] * filters[i];
}
// Batch-norm: scale and bias are read per output channel block.
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output = activation(output);
output = activation(output);
#endif
/*
if (output_pos.x == 112 && output_pos.y == 0) {
for (int i = 0; i < 9; ++i) {
half4 input1 = inputs[i];
float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
printf(" input4 %d - %v4hlf \n", i, in);
}
float4 out = (float4)(output.x, output.y, output.z, output.w);
printf(" depth wise output output4 = %v4hlf \n", out);
printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x);
printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y);
printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x);
printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y);
}
*/
/*
if (output_pos.x == 112 && output_pos.y == 0) {
for (int i = 0; i < 9; ++i) {
half4 input1 = inputs[i];
float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
printf(" input4 %d - %v4hlf \n", i, in);
}
float4 out = (float4)(output.x, output.y, output.z, output.w);
printf(" depth wise output output4 = %v4hlf \n", out);
printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x);
printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y);
printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x);
printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y);
}
*/
write_imageh(output_image, output_pos, output);
write_imageh(output_image, output_pos, output);
}
__kernel void depth_conv_3x3s1(__private const int ou_ch_blk,
__private const int ou_w_blk,
__private const int ou_nh,
__read_only image2d_t input,
__read_only image2d_t filter,
__kernel void depth_conv_3x3s1(
__private const int ou_ch_blk, __private const int ou_w_blk,
__private const int ou_nh, __read_only image2d_t input,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int pad,
__private const int dilation,
__private const int in_ch,
__private const int in_w,/* of one block */
__private const int in_h, /* of one block */
__private const int ou_w,
__private const int ou_h) {
const int ou_ch_blk_id = get_global_id(0);
const int ou_w_blk_id = get_global_id(1);
const int ou_nh_id = get_global_id(2);
const int w_blk_size = 2;
const int batch_id = ou_nh_id / ou_h;
int ou_col_id = ou_w_blk_id * w_blk_size;
int ou_row_id = ou_nh_id % ou_h;
int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id);
// input pos in one block and on batch
int col_id = ou_col_id - pad;
int row_id = ou_row_id - pad;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST;
#ifdef BIASE_CH
half4 output[2];
output[0] = read_imageh(bias, sampler, (int2)(ou_ch_blk_id, 0));
output[1] = output[0];
#elif defined(BIASE_ELE)
half4 output[2];
output[0] = read_imageh(bias, sampler, (int2)(ou_x, ou_nh_id));
if (ou_col_id + 1 < ou_w) {
output[1] = read_imageh(bias, sampler, (int2)(ou_x + 1, ou_nh_id));
}
#else
half4 output[2] = {0.0f};
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int pad, __private const int dilation,
__private const int in_ch, __private const int in_w, /* of one block */
__private const int in_h, /* of one block */
__private const int ou_w, __private const int ou_h) {
half4 inputs[12];
const int ou_ch_blk_id = get_global_id(0);
const int ou_w_blk_id = get_global_id(1);
const int ou_nh_id = get_global_id(2);
const int w_blk_size = 2;
int filter_x = ou_ch_blk_id * 3;
int filter_y = 0;
half4 filters[9];
filters[0] = read_imageh(filter, sampler,(int2)(filter_x,filter_y));
filters[1] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y));
filters[2] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y));
const int batch_id = ou_nh_id / ou_h;
int ou_col_id = ou_w_blk_id * w_blk_size;
int ou_row_id = ou_nh_id % ou_h;
int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id);
int in_x = mad24(ou_ch_blk_id, in_w, col_id);
int in_y = mad24(batch_id, in_h, row_id);
// input pos in one block and on batch
int col_id = ou_col_id - pad;
int row_id = ou_row_id - pad;
int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h);
int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w);
inputs[0] = read_imageh(input, sampler, (int2)(x0, y0));
int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w);
inputs[1] = read_imageh(input, sampler, (int2)(x1, y0));
int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w);
inputs[2] = read_imageh(input, sampler, (int2)(x2, y0));
int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w);
inputs[3] = read_imageh(input, sampler, (int2)(x3, y0));
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
| CLK_FILTER_NEAREST;
output[0] = mad(inputs[0], filters[0], output[0]);
output[1] = mad(inputs[1], filters[0], output[1]);
#ifdef BIASE_CH
half4 output[2];
output[0] = read_imageh(bias, sampler, (int2)(ou_ch_blk_id, 0));
output[1] = output[0];
#elif defined(BIASE_ELE)
half4 output[2];
output[0] = read_imageh(bias, sampler, (int2)(ou_x, ou_nh_id));
if (ou_col_id + 1 < ou_w) {
output[1] = read_imageh(bias, sampler, (int2)(ou_x + 1, ou_nh_id));
}
#else
half4 output[2] = {0.0f};
#endif
output[0] = mad(inputs[1], filters[1], output[0]);
output[1] = mad(inputs[2], filters[1], output[1]);
half4 inputs[12];
output[0] = mad(inputs[2], filters[2], output[0]);
output[1] = mad(inputs[3], filters[2], output[1]);
int filter_x = ou_ch_blk_id * 3;
int filter_y = 0;
half4 filters[9];
filters[0] = read_imageh(filter, sampler, (int2)(filter_x, filter_y));
filters[1] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y));
filters[2] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y));
int in_x = mad24(ou_ch_blk_id, in_w, col_id);
int in_y = mad24(batch_id, in_h, row_id);
filters[3] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 1));
filters[4] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 1));
filters[5] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 1));
int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h);
int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w);
inputs[0] = read_imageh(input, sampler, (int2)(x0, y0));
int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w);
inputs[1] = read_imageh(input, sampler, (int2)(x1, y0));
int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w);
inputs[2] = read_imageh(input, sampler, (int2)(x2, y0));
int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w);
inputs[3] = read_imageh(input, sampler, (int2)(x3, y0));
output[0] = mad(inputs[0], filters[0], output[0]);
output[1] = mad(inputs[1], filters[0], output[1]);
int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h);
inputs[4] = read_imageh(input, sampler, (int2)(x0, y1));
inputs[5] = read_imageh(input, sampler, (int2)(x1, y1));
inputs[6] = read_imageh(input, sampler, (int2)(x2, y1));
inputs[7] = read_imageh(input, sampler, (int2)(x3, y1));
output[0] = mad(inputs[1], filters[1], output[0]);
output[1] = mad(inputs[2], filters[1], output[1]);
output[0] = mad(inputs[2], filters[2], output[0]);
output[1] = mad(inputs[3], filters[2], output[1]);
output[0] = mad(inputs[4], filters[3], output[0]);
output[1] = mad(inputs[5], filters[3], output[1]);
filters[3] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 1));
filters[4] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 1));
filters[5] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 1));
output[0] = mad(inputs[5], filters[4], output[0]);
output[1] = mad(inputs[6], filters[4], output[1]);
int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h);
inputs[4] = read_imageh(input, sampler, (int2)(x0, y1));
inputs[5] = read_imageh(input, sampler, (int2)(x1, y1));
inputs[6] = read_imageh(input, sampler, (int2)(x2, y1));
inputs[7] = read_imageh(input, sampler, (int2)(x3, y1));
output[0] = mad(inputs[6], filters[5
], output[0]);
output[1] = mad(inputs[7], filters[5
], output[1]);
output[0] = mad(inputs[4], filters[3
], output[0]);
output[1] = mad(inputs[5], filters[3
], output[1]);
output[0] = mad(inputs[5], filters[4], output[0]);
output[1] = mad(inputs[6], filters[4], output[1]);
filters[6] = read_imageh(filter, sampler,(int2)(filter_x,filter_y + 2));
filters[7] = read_imageh(filter, sampler,(int2)(filter_x + 1,filter_y + 2));
filters[8] = read_imageh(filter, sampler,(int2)(filter_x + 2,filter_y + 2));
output[0] = mad(inputs[6], filters[5], output[0]);
output[1] = mad(inputs[7], filters[5], output[1]);
int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h);
inputs[8] = read_imageh(input, sampler, (int2)(x0, y2));
inputs[9] = read_imageh(input, sampler, (int2)(x1, y2));
inputs[10] = read_imageh(input, sampler, (int2)(x2, y2));
inputs[11] = read_imageh(input, sampler, (int2)(x3, y2));
filters[6] = read_imageh(filter, sampler, (int2)(filter_x, filter_y + 2));
filters[7] = read_imageh(filter, sampler, (int2)(filter_x + 1, filter_y + 2));
filters[8] = read_imageh(filter, sampler, (int2)(filter_x + 2, filter_y + 2));
int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h);
inputs[8] = read_imageh(input, sampler, (int2)(x0, y2));
inputs[9] = read_imageh(input, sampler, (int2)(x1, y2));
inputs[10] = read_imageh(input, sampler, (int2)(x2, y2));
inputs[11] = read_imageh(input, sampler, (int2)(x3, y2));
output[0] = mad(inputs[8], filters[6], output[0]);
output[1] = mad(inputs[9], filters[6], output[1]);
output[0] = mad(inputs[8], filters[6], output[0]);
output[1] = mad(inputs[9], filters[6], output[1]);
output[0] = mad(inputs[9], filters[7], output[0]);
output[1] = mad(inputs[10], filters[7], output[1]);
output[0] = mad(inputs[9], filters[7], output[0]);
output[1] = mad(inputs[10], filters[7], output[1]);
output[0] = mad(inputs[10], filters[8], output[0]);
output[1] = mad(inputs[11], filters[8], output[1]);
output[0] = mad(inputs[10], filters[8], output[0]);
output[1] = mad(inputs[11], filters[8], output[1]);
#ifdef BATCH_NORM
half4 scale = read_imageh(new_scale, sampler, (int2)(ou_ch_blk_id, 0));
half4 biase = read_imageh(new_biase, sampler, (int2)(ou_ch_blk_id, 0));
output[0] = mad(scale, output[0], biase);
if (ou_col_id + 1 < ou_w) {
output[1] = mad(scale, output[1], biase);
}
half4 scale = read_imageh(new_scale, sampler, (int2)(ou_ch_blk_id, 0));
half4 biase = read_imageh(new_biase, sampler, (int2)(ou_ch_blk_id, 0));
output[0] = mad(scale, output[0], biase);
if (ou_col_id + 1 < ou_w) {
output[1] = mad(scale, output[1], biase);
}
#endif
#ifdef RELU
output[0] = activation(output[0]);
output[1] = activation(output[1]);
output[0] = activation(output[0]);
output[1] = activation(output[1]);
#endif
write_imageh(output_image, (int2)(ou_x, ou_nh_id), output[0]);
if (ou_col_id + 1 < ou_w) {
write_imageh(output_image, (int2)(ou_x + 1, ou_nh_id), output[1]);
}
write_imageh(output_image, (int2)(ou_x, ou_nh_id), output[0]);
if (ou_col_id + 1 < ou_w) {
write_imageh(output_image, (int2)(ou_x + 1, ou_nh_id), output[1]);
}
}
__kernel void conv_1x1(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
__kernel void conv_1x1(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
| CLK_FILTER_NEAREST;
const uint kernelHXW = 1;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos);
half4 output = read_imageh(bias, sampler, output_pos);
#else
half4 output = 0.0f;
half4 output = 0.0f;
#endif
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
half4 input = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
/*
output.x = dot(input, weight0);
output.y = dot(input, weight1);
output.z = dot(input, weight2);
output.w = dot(input, weight3);
*/
for (int i = 0; i < input_c; ++i) {
int2 pos_in =
(int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
half4 input = read_imageh(input_image, sampler, pos_in);
output = mad(input.x, weight0, output);
output = mad(input.y, weight1, output);
output = mad(input.z, weight2, output);
output = mad(input.w, weight3, output);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
/*
output.x = dot(input, weight0);
output.y = dot(input, weight1);
output.z = dot(input, weight2);
output.w = dot(input, weight3);
*/
}
output = mad(input.x, weight0, output);
output = mad(input.y, weight1, output);
output = mad(input.z, weight2, output);
output = mad(input.w, weight3, output);
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
...
...
@@ -1017,14 +1138,12 @@ __kernel void conv_1x1_simple(
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int input_c_origin,
__private const int dilation,
__private const int offset, __private const int input_c,
__private const int
input_c_origin, __private const int
dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int old_w
) {
__private const int output_width, __private const int output_height,
__private const int old_w) {
half zero = 0.0f;
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
...
...
@@ -1035,7 +1154,7 @@ __kernel void conv_1x1_simple(
int out_w2 = out_w + global_size_dim1 * 2;
int out_w3 = out_w + global_size_dim1 * 3;
int outpos_main = mul24(out_c
, old_w);
int outpos_main = mul24(out_c, old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
...
...
@@ -1064,14 +1183,14 @@ __kernel void conv_1x1_simple(
#ifdef BIASE_CH
half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output1 = output0;
half4 output2 = output0;
half4 output3 = output0;
half4 output1 = output0;
half4 output2 = output0;
half4 output3 = output0;
#elif defined(BIASE_ELE)
half4 output0 = read_imageh(bias, sampler, output_pos0);
half4 output1 = output0;
half4 output2 = output0;
half4 output3 = output0;
half4 output1 = output0;
half4 output2 = output0;
half4 output3 = output0;
#else
half4 output0 = 0.0f;
...
...
@@ -1082,7 +1201,8 @@ __kernel void conv_1x1_simple(
for (int i = 0; i < input_c; ++i) {
// ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
in_pos_in_one_block0.y);
half4 input0 = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
...
...
@@ -1095,7 +1215,8 @@ __kernel void conv_1x1_simple(
output0 = mad(input0.z, weight2, output0);
output0 = mad(input0.w, weight3, output0);
// -------------1--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
in_pos_in_one_block1.y);
half4 input1 = read_imageh(input_image, sampler, pos_in);
output1 = mad(input1.x, weight0, output1);
...
...
@@ -1104,7 +1225,8 @@ __kernel void conv_1x1_simple(
output1 = mad(input1.w, weight3, output1);
// -------------2--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
in_pos_in_one_block2.y);
half4 input2 = read_imageh(input_image, sampler, pos_in);
output2 = mad(input2.x, weight0, output2);
...
...
@@ -1113,7 +1235,8 @@ __kernel void conv_1x1_simple(
output2 = mad(input2.w, weight3, output2);
// -------------3--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
in_pos_in_one_block3.y);
half4 input3 = read_imageh(input_image, sampler, pos_in);
output3 = mad(input3.x, weight0, output3);
...
...
@@ -1124,38 +1247,38 @@ __kernel void conv_1x1_simple(
#ifdef BATCH_NORM
output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output0 = activation(output0);
output1 = activation(output1);
output2 = activation(output2);
output3 = activation(output3);
output1 = activation(output1);
output2 = activation(output2);
output3 = activation(output3);
#endif
if (out_w0 < old_w) {
write_imageh(output_image, output_pos0, output0);
}
if (out_w1 < old_w){
if (out_w1 < old_w)
{
write_imageh(output_image, output_pos1, output1);
}
if (out_w2 < old_w){
if (out_w2 < old_w)
{
write_imageh(output_image, output_pos2, output2);
}
if (out_w3 < old_w){
if (out_w3 < old_w)
{
write_imageh(output_image, output_pos3, output3);
}
}
...
...
@@ -1170,14 +1293,12 @@ __kernel void conv_1x1_wrapped(
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int input_c_origin,
__private const int dilation,
__private const int offset, __private const int input_c,
__private const int
input_c_origin, __private const int
dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int old_w
) {
__private const int output_width, __private const int output_height,
__private const int old_w) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
...
...
@@ -1188,7 +1309,7 @@ __kernel void conv_1x1_wrapped(
int out_w2 = out_w + global_size_dim1 * 2;
int out_w3 = out_w + global_size_dim1 * 3;
int outpos_main = mul24(out_c
, old_w);
int outpos_main = mul24(out_c, old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
...
...
@@ -1216,15 +1337,15 @@ __kernel void conv_1x1_wrapped(
ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
#ifdef BIASE_CH
half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output1 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output2 = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output3 = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
half4 output0 = read_imageh(bias, sampler, output_pos0);
half4 output1 = read_imageh(bias, sampler, output_pos1);
half4 output2 = read_imageh(bias, sampler, output_pos2);
half4 output3 = read_imageh(bias, sampler, output_pos3);
half4 output0 = read_imageh(bias, sampler, output_pos0);
half4 output1 = read_imageh(bias, sampler, output_pos1);
half4 output2 = read_imageh(bias, sampler, output_pos2);
half4 output3 = read_imageh(bias, sampler, output_pos3);
#else
half4 output0 = 0.0f;
...
...
@@ -1237,7 +1358,8 @@ __kernel void conv_1x1_wrapped(
int burndary_index = input_c * 4 - input_c_origin;
for (int i = 0; i < input_c; ++i) {
// ------------0---------------
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, in_pos_in_one_block0.y);
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
in_pos_in_one_block0.y);
half4 input0 = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
...
...
@@ -1245,30 +1367,31 @@ __kernel void conv_1x1_wrapped(
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
if ((max_w_bound - pos_in.x-1) < input_width && (max_w_bound - pos_in.x-1)>=0 ){
if (burndary_index==0){
if ((max_w_bound - pos_in.x - 1) < input_width &&
(max_w_bound - pos_in.x - 1) >= 0) {
if (burndary_index == 0) {
output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0);
output0 = mad(input0.w, weight3, output0);
} else if (burndary_index
==1)
{
} else if (burndary_index
== 1)
{
output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0);
output0 = mad(0.0f, weight3, output0);
} else if (burndary_index
==2)
{
} else if (burndary_index
== 2)
{
output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0);
output0 = mad(0.0f, weight2, output0);
output0 = mad(0.0f, weight3, output0);
} else if (burndary_index
==3)
{
} else if (burndary_index
== 3)
{
output0 = mad(input0.x, weight0, output0);
output0 = mad(0.0f, weight1, output0);
output0 = mad(0.0f, weight2, output0);
output0 = mad(0.0f, weight3, output0);
}
}else {
}
else {
output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0);
...
...
@@ -1276,33 +1399,34 @@ __kernel void conv_1x1_wrapped(
}
// -------------1--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, in_pos_in_one_block1.y);
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
in_pos_in_one_block1.y);
half4 input1 = read_imageh(input_image, sampler, pos_in);
if (abs(max_w_bound - pos_in.x) < input_width){
if (burndary_index
==0)
{
if (abs(max_w_bound - pos_in.x) < input_width)
{
if (burndary_index
== 0)
{
output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1);
output1 = mad(input1.w, weight3, output1);
} else if (burndary_index
==1)
{
} else if (burndary_index
== 1)
{
output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1);
output1 = mad(0.0f, weight3, output1);
} else if (burndary_index
==2)
{
} else if (burndary_index
== 2)
{
output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1);
output1 = mad(0.0f, weight2, output1);
output1 = mad(0.0f, weight3, output1);
} else if (burndary_index
==3)
{
} else if (burndary_index
== 3)
{
output1 = mad(input1.x, weight0, output1);
output1 = mad(0.0f, weight1, output1);
output1 = mad(0.0f, weight2, output1);
output1 = mad(0.0f, weight3, output1);
}
}else {
}
else {
output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1);
...
...
@@ -1310,33 +1434,34 @@ __kernel void conv_1x1_wrapped(
}
// -------------2--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, in_pos_in_one_block2.y);
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
in_pos_in_one_block2.y);
half4 input2 = read_imageh(input_image, sampler, pos_in);
if (abs(max_w_bound - pos_in.x) < input_width){
if (burndary_index
==0)
{
if (abs(max_w_bound - pos_in.x) < input_width)
{
if (burndary_index
== 0)
{
output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2);
output2 = mad(input2.z, weight2, output2);
output2 = mad(input2.w, weight3, output2);
} else if (burndary_index
==1)
{
} else if (burndary_index
== 1)
{
output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2);
output2 = mad(input2.z, weight2, output2);
output2 = mad(0.0f, weight3, output2);
} else if (burndary_index
==2)
{
} else if (burndary_index
== 2)
{
output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2);
output2 = mad(0.0f, weight2, output2);
output2 = mad(0.0f, weight3, output2);
} else if (burndary_index
==3)
{
} else if (burndary_index
== 3)
{
output2 = mad(input2.x, weight0, output2);
output2 = mad(0.0f, weight1, output2);
output2 = mad(0.0f, weight2, output2);
output2 = mad(0.0f, weight3, output2);
}
}else {
}
else {
output2 = mad(input2.x, weight0, output2);
output2 = mad(input2.y, weight1, output2);
output2 = mad(input2.z, weight2, output2);
...
...
@@ -1344,33 +1469,34 @@ __kernel void conv_1x1_wrapped(
}
// -------------3--------------
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, in_pos_in_one_block3.y);
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
in_pos_in_one_block3.y);
half4 input3 = read_imageh(input_image, sampler, pos_in);
if (abs(max_w_bound - pos_in.x) < input_width){
if (burndary_index
==0)
{
if (abs(max_w_bound - pos_in.x) < input_width)
{
if (burndary_index
== 0)
{
output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3);
output3 = mad(input3.z, weight2, output3);
output3 = mad(input3.w, weight3, output3);
} else if (burndary_index
==1)
{
} else if (burndary_index
== 1)
{
output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3);
output3 = mad(input3.z, weight2, output3);
output3 = mad(0.0f, weight3, output3);
} else if (burndary_index
==2)
{
} else if (burndary_index
== 2)
{
output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3);
output3 = mad(0.0f, weight2, output3);
output3 = mad(0.0f, weight3, output3);
} else if (burndary_index
==3)
{
} else if (burndary_index
== 3)
{
output3 = mad(input3.x, weight0, output3);
output3 = mad(0.0f, weight1, output3);
output3 = mad(0.0f, weight2, output3);
output3 = mad(0.0f, weight3, output3);
}
}else {
}
else {
output3 = mad(input3.x, weight0, output3);
output3 = mad(input3.y, weight1, output3);
output3 = mad(input3.z, weight2, output3);
...
...
@@ -1379,1015 +1505,1060 @@ __kernel void conv_1x1_wrapped(
}
#ifdef BATCH_NORM
output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output0 = activation(output0);
output1 = activation(output1);
output2 = activation(output2);
output3 = activation(output3);
output0 = activation(output0);
output1 = activation(output1);
output2 = activation(output2);
output3 = activation(output3);
#endif
if (out_w0 < old_w) {
write_imageh(output_image, output_pos0, output0);
}
if (out_w1 < old_w){
if (out_w1 < old_w)
{
write_imageh(output_image, output_pos1, output1);
}
if (out_w2 < old_w){
if (out_w2 < old_w)
{
write_imageh(output_image, output_pos2, output2);
}
if (out_w3 < old_w){
if (out_w3 < old_w)
{
write_imageh(output_image, output_pos3, output3);
}
}
__kernel void conv_7x7(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
__kernel void conv_7x7(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const int filter_n0 = 4 * out_c + 0;
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const int filter_n0 = 4 * out_c + 0;
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
int2 ouput_pos_in_one_block
;
ouput_pos_in_one_block.x = out_w
;
ouput_pos_in_one_block.y = out_nh
;
int2 stride_xy
;
stride_xy.x = stride
;
stride_xy.y = stride
;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
| CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE_CH
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
half4 output = read_imageh(bias, sampler, output_pos);
half4 output = read_imageh(bias, sampler, output_pos);
#else
half4 output = 0.0f;
#endif
half4 input;
half4 filter[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
for(int j = 0; j < 7; j++){
for(int k = 0; k < 7; k++){
input = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + (j - 3) * dilation, pos_in.y + (k - 3) * dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 || in_pos_in_one_block.y + (k - 3) * dilation < 0 || in_pos_in_one_block.x + (j - 3) * dilation >= input_width || in_pos_in_one_block.y + (k - 3) * dilation >= input_height) << 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
filter_pos0.x = filter_c * 7 + filter_w;
filter_pos0.y = filter_n0 * 7 + filter_h;
filter_pos1.x = filter_c * 7 + filter_w;
filter_pos1.y = filter_n1 * 7 + filter_h;
filter_pos2.x = filter_c * 7 + filter_w;
filter_pos2.y = filter_n2 * 7 + filter_h;
filter_pos3.x = filter_c * 7 + filter_w;
filter_pos3.y = filter_n3 * 7 + filter_h;
filter[0] = read_imageh(filter_image, sampler, filter_pos0);
filter[1] = read_imageh(filter_image, sampler, filter_pos1);
filter[2] = read_imageh(filter_image, sampler, filter_pos2);
filter[3] = read_imageh(filter_image, sampler, filter_pos3);
output.x += dot(input, filter[0]);
output.y += dot(input, filter[1]);
output.z += dot(input, filter[2]);
output.w += dot(input, filter[3]);
}
}
half4 output = 0.0f;
#endif
half4 input;
half4 filter[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
for (int i = 0; i < input_c; ++i) {
int2 pos_in =
(int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
for (int j = 0; j < 7; j++) {
for (int k = 0; k < 7; k++) {
input = select(
read_imageh(input_image, sampler,
(int2)(pos_in.x + (j - 3) * dilation,
pos_in.y + (k - 3) * dilation)),
(half4)(0.0f),
(ushort4)(
(in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
in_pos_in_one_block.x + (j - 3) * dilation >= input_width ||
in_pos_in_one_block.y + (k - 3) * dilation >= input_height)
<< 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
filter_pos0.x = filter_c * 7 + filter_w;
filter_pos0.y = filter_n0 * 7 + filter_h;
filter_pos1.x = filter_c * 7 + filter_w;
filter_pos1.y = filter_n1 * 7 + filter_h;
filter_pos2.x = filter_c * 7 + filter_w;
filter_pos2.y = filter_n2 * 7 + filter_h;
filter_pos3.x = filter_c * 7 + filter_w;
filter_pos3.y = filter_n3 * 7 + filter_h;
filter[0] = read_imageh(filter_image, sampler, filter_pos0);
filter[1] = read_imageh(filter_image, sampler, filter_pos1);
filter[2] = read_imageh(filter_image, sampler, filter_pos2);
filter[3] = read_imageh(filter_image, sampler, filter_pos3);
output.x += dot(input, filter[0]);
output.y += dot(input, filter[1]);
output.z += dot(input, filter[2]);
output.w += dot(input, filter[3]);
}
}
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output = activation(output);
output = activation(output);
#endif
write_imageh(output_image, output_pos, output);
write_imageh(output_image, output_pos, output);
}
__kernel void conv_7x7Pt1x2(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
__kernel void conv_7x7Pt1x2(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w1 = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= global_size_dim0 ||
out_w1 >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const int out_w = out_w1 * 2;
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height) {
const int out_c = get_global_id(0);
const int out_w1 = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * output_width + out_w, out_nh);
if (out_c >= global_size_dim0 || out_w1 >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const int out_w = out_w1 * 2;
const int filter_n0 = 4 * out_c + 0;
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
int2 output_pos = (int2)(out_c * output_width + out_w, out_nh);
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
const int filter_n0 = 4 * out_c + 0;
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
int2 ouput_pos_in_one_block
;
ouput_pos_in_one_block.x = out_w
;
ouput_pos_in_one_block.y = out_nh
;
int2 stride_xy
;
stride_xy.x = stride
;
stride_xy.y = stride
;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
| CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
half4 output0 = 0.0f;
half4 output1 = 0.0f;
half4 output0 = 0.0f;
half4 output1 = 0.0f;
#ifdef BIASE_CH
output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
output1 = output0;
output0 = read_imageh(bias, sampler, (int2)(out_c, 0));
output1 = output0;
#elif defined(BIASE_ELE)
output0 = read_imageh(bias, sampler, output_pos);
output1 = read_imageh(bias, sampler, (int2)(output_pos.x + 1, output_pos.y));
output0 = read_imageh(bias, sampler, output_pos);
output1 = read_imageh(bias, sampler, (int2)(output_pos.x + 1, output_pos.y));
#else
output0 = 0.0f;
output1 = 0.0f;
#endif
half4 input[8];
half4 filter0[4];
half4 filter1[4];
half4 filter2[4];
half4 filter3[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
for(int k = 0; k < 7; k++){
for (int j = 0; j < 8; j++) {
input[j] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + (j - 3) * dilation, pos_in.y + (k - 3) * dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 || in_pos_in_one_block.y + (k - 3) * dilation < 0 || in_pos_in_one_block.x + (j - 3) * dilation >= input_width || in_pos_in_one_block.y + (k - 3) * dilation >= input_height) << 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
if (j < 7) {
filter_pos0.x = filter_c * 7 + filter_w;
filter_pos0.y = filter_n0 * 7 + filter_h;
filter_pos1.x = filter_c * 7 + filter_w;
filter_pos1.y = filter_n1 * 7 + filter_h;
filter_pos2.x = filter_c * 7 + filter_w;
filter_pos2.y = filter_n2 * 7 + filter_h;
filter_pos3.x = filter_c * 7 + filter_w;
filter_pos3.y = filter_n3 * 7 + filter_h;
filter0[0] = read_imageh(filter_image, sampler, filter_pos0);
filter0[1] = read_imageh(filter_image, sampler, filter_pos1);
filter0[2] = read_imageh(filter_image, sampler, filter_pos2);
filter0[3] = read_imageh(filter_image, sampler, filter_pos3);
output0.x += dot(input[j], filter0[0]);
output0.y += dot(input[j], filter0[1]);
output0.z += dot(input[j], filter0[2]);
output0.w += dot(input[j], filter0[3]);
}
if (j > 0) {
output1.x += dot(input[j], filter1[0]);
output1.y += dot(input[j], filter1[1]);
output1.z += dot(input[j], filter1[2]);
output1.w += dot(input[j], filter1[3]);
}
filter1[0] = filter0[0];
filter1[1] = filter0[1];
filter1[2] = filter0[2];
filter1[3] = filter0[3];
}
output0 = 0.0f;
output1 = 0.0f;
#endif
half4 input[8];
half4 filter0[4];
half4 filter1[4];
half4 filter2[4];
half4 filter3[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
for (int i = 0; i < input_c; ++i) {
int2 pos_in =
(int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
for (int k = 0; k < 7; k++) {
for (int j = 0; j < 8; j++) {
input[j] = select(
read_imageh(input_image, sampler,
(int2)(pos_in.x + (j - 3) * dilation,
pos_in.y + (k - 3) * dilation)),
(half4)(0.0f),
(ushort4)(
(in_pos_in_one_block.x + (j - 3) * dilation < 0 ||
in_pos_in_one_block.y + (k - 3) * dilation < 0 ||
in_pos_in_one_block.x + (j - 3) * dilation >= input_width ||
in_pos_in_one_block.y + (k - 3) * dilation >= input_height)
<< 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
if (j < 7) {
filter_pos0.x = filter_c * 7 + filter_w;
filter_pos0.y = filter_n0 * 7 + filter_h;
filter_pos1.x = filter_c * 7 + filter_w;
filter_pos1.y = filter_n1 * 7 + filter_h;
filter_pos2.x = filter_c * 7 + filter_w;
filter_pos2.y = filter_n2 * 7 + filter_h;
filter_pos3.x = filter_c * 7 + filter_w;
filter_pos3.y = filter_n3 * 7 + filter_h;
filter0[0] = read_imageh(filter_image, sampler, filter_pos0);
filter0[1] = read_imageh(filter_image, sampler, filter_pos1);
filter0[2] = read_imageh(filter_image, sampler, filter_pos2);
filter0[3] = read_imageh(filter_image, sampler, filter_pos3);
output0.x += dot(input[j], filter0[0]);
output0.y += dot(input[j], filter0[1]);
output0.z += dot(input[j], filter0[2]);
output0.w += dot(input[j], filter0[3]);
}
if (j > 0) {
output1.x += dot(input[j], filter1[0]);
output1.y += dot(input[j], filter1[1]);
output1.z += dot(input[j], filter1[2]);
output1.w += dot(input[j], filter1[3]);
}
}
filter1[0] = filter0[0];
filter1[1] = filter0[1];
filter1[2] = filter0[2];
filter1[3] = filter0[3];
}
}
}
#ifdef BATCH_NORM
half4 s = read_imageh(new_scale, sampler, (int2)(out_c, 0));
half4 b = read_imageh(new_biase, sampler, (int2)(out_c, 0));
output0 = output0 * s + b;
output1 = output1 * s + b;
half4 s = read_imageh(new_scale, sampler, (int2)(out_c, 0));
half4 b = read_imageh(new_biase, sampler, (int2)(out_c, 0));
output0 = output0 * s + b;
output1 = output1 * s + b;
#endif
#ifdef RELU
output0 = activation(output0);
output1 = activation(output1);
output0 = activation(output0);
output1 = activation(output1);
#endif
write_imageh(output_image, output_pos, output0);
if ((output_pos.x + 1) % output_width != 0) {
write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output1);
}
write_imageh(output_image, output_pos, output0);
if ((output_pos.x + 1) % output_width != 0) {
write_imageh(output_image, (int2)(output_pos.x + 1, output_pos.y), output1);
}
}
// dilation == 1
/*
 * conv_7x7spl: 7x7 convolution on half-precision 2D images.
 *
 * Each work-item accumulates up to five output texels on image row
 * `item_h_id` (global id 2). Their columns inside the output block are
 * item_w_id + n * item_w for n = 0..4 (global id 1), and the block itself
 * starts at x = item_ch_id * out_w (global id 0). Columns that fall at or
 * beyond `out_w` are computed but never written.
 *
 * Optional stages are selected at build time:
 *   BIASE_CH   - accumulators start from bias texel (item_ch_id, 0)
 *   BIASE_ELE  - accumulators start from the bias texel at the output position
 *   BATCH_NORM - result = scale * result + biase, read at (item_ch_id, 0)
 *   RELU       - result is passed through activation()
 *
 * `item_ch`, `item_h` and `dilation` are accepted but not read by this kernel.
 *
 * NOTE(review): every read_imageh call passes integer coordinates while the
 * sampler requests CLK_NORMALIZED_COORDS_TRUE; the OpenCL C specification
 * appears to define integer-coordinate reads only for non-normalized
 * samplers - confirm on the target drivers before changing.
 */
__kernel void conv_7x7spl(
    __private const int item_ch, __private const int item_w,
    __private const int item_h, __read_only image2d_t input_image,
    __read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
    __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
    __write_only image2d_t output_image, __private const int stride,
    __private const int pad, __private const int dilation,
    __private const int in_ch, __private const int in_w,
    __private const int in_h, __private const int out_w,
    __private const int out_h) {
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  // filter
  const int filter_w = 7;
  const int filter_h = 7;

  // item_id
  const int item_ch_id = get_global_id(0);
  const int item_w_id = get_global_id(1);
  const int item_h_id = get_global_id(2);

  // out_width_id_per_blk and out_batch_id
  // NOTE(review): the batch index divides by in_h while in_h_id below uses
  // item_h_id % out_h - confirm this is intended when in_h != out_h.
  int out_batch_id = item_h_id / in_h;
  int out_w_base_id = item_ch_id * out_w;
  int out_w_id0 = item_w_id;
  int out_w_id1 = out_w_id0 + item_w;
  int out_w_id2 = out_w_id1 + item_w;
  int out_w_id3 = out_w_id2 + item_w;
  int out_w_id4 = out_w_id3 + item_w;

  // in_width_id_per_blk and in_height_id_per_batch
  int in_h_id = (item_h_id % out_h) * stride - pad;
  int in_w_id0 = item_w_id * stride - pad;
  int in_w_id1 = in_w_id0 + item_w * stride;
  int in_w_id2 = in_w_id1 + item_w * stride;
  int in_w_id3 = in_w_id2 + item_w * stride;
  int in_w_id4 = in_w_id3 + item_w * stride;

  // Accumulators always start from zero so that lanes whose column is out of
  // range (and therefore skipped by the BIASE_ELE reads) never hold
  // indeterminate values when they flow through mad() below.
  half4 output[5] = {0.0f};
#ifdef BIASE_CH
  output[0] = read_imageh(bias, sampler, (int2)(item_ch_id, 0));
  output[1] = output[0];
  output[2] = output[0];
  output[3] = output[0];
  output[4] = output[0];
#elif defined(BIASE_ELE)
  output[0] =
      read_imageh(bias, sampler, (int2)(out_w_base_id + out_w_id0, item_h_id));
  if (out_w_id1 < out_w) {
    output[1] = read_imageh(bias, sampler,
                            (int2)(out_w_base_id + out_w_id1, item_h_id));
  }
  if (out_w_id2 < out_w) {
    output[2] = read_imageh(bias, sampler,
                            (int2)(out_w_base_id + out_w_id2, item_h_id));
  }
  if (out_w_id3 < out_w) {
    output[3] = read_imageh(bias, sampler,
                            (int2)(out_w_base_id + out_w_id3, item_h_id));
  }
  if (out_w_id4 < out_w) {
    output[4] = read_imageh(bias, sampler,
                            (int2)(out_w_base_id + out_w_id4, item_h_id));
  }
#endif

  half4 filter[4] = {0.0f};
  half4 filter_trans[4] = {0.0f};
  half4 input[5] = {0.0f};

  // Filter rows of the four output channels handled by this work-item.
  int filter_h_val0 = item_ch_id * 4 * filter_h;
  int filter_h_val1 = filter_h_val0 + filter_h;
  int filter_h_val2 = filter_h_val1 + filter_h;
  int filter_h_val3 = filter_h_val2 + filter_h;

  // Input channels are consumed four at a time (one half4 texel per group).
  for (int ch = 0; ch < (in_ch + 3) / 4; ch++) {
    // Number of lanes in this group that lie beyond in_ch (0 unless this is
    // the last, partially filled group).
    int ch_surplus = (ch + 1) * 4 - in_ch > 0 ? (ch + 1) * 4 - in_ch : 0;

    const int in_w_base_id = mul24(ch, in_w);

    int filter_w_val = ch * filter_w;

    for (int h = 0; h < filter_h; h++) {
      // Out-of-range rows/columns are redirected to coordinate -1.
      // NOTE(review): the row test compares the batch-offset row against
      // in_h alone - confirm behaviour for out_batch_id > 0.
      int in_h_val = select(out_batch_id * in_h + in_h_id + h, -1,
                            (out_batch_id * in_h + in_h_id + h < 0 ||
                             out_batch_id * in_h + in_h_id + h >= in_h));

      for (int w = 0; w < filter_w; w++) {
        int in_w_val0 = select(in_w_base_id + in_w_id0 + w, -1,
                               (in_w_id0 + w < 0 || in_w_id0 + w >= in_w));
        int in_w_val1 = select(in_w_base_id + in_w_id1 + w, -1,
                               (in_w_id1 + w < 0 || in_w_id1 + w >= in_w));
        int in_w_val2 = select(in_w_base_id + in_w_id2 + w, -1,
                               (in_w_id2 + w < 0 || in_w_id2 + w >= in_w));
        int in_w_val3 = select(in_w_base_id + in_w_id3 + w, -1,
                               (in_w_id3 + w < 0 || in_w_id3 + w >= in_w));
        int in_w_val4 = select(in_w_base_id + in_w_id4 + w, -1,
                               (in_w_id4 + w < 0 || in_w_id4 + w >= in_w));

        filter[0] = read_imageh(
            filter_image, sampler,
            (int2)(filter_w_val + w, filter_h_val0 + h));  // in_ch:0-3,out_ch:0
        filter[1] = read_imageh(
            filter_image, sampler,
            (int2)(filter_w_val + w, filter_h_val1 + h));  // in_ch:0-3,out_ch:1
        filter[2] = read_imageh(
            filter_image, sampler,
            (int2)(filter_w_val + w, filter_h_val2 + h));  // in_ch:0-3,out_ch:2
        filter[3] = read_imageh(
            filter_image, sampler,
            (int2)(filter_w_val + w, filter_h_val3 + h));  // in_ch:0-3,out_ch:3

        // Transpose so each vector holds one input channel across the four
        // output channels; this lets mad() update all four at once.
        filter_trans[0] = (half4)(filter[0].x, filter[1].x, filter[2].x,
                                  filter[3].x);  // in_ch:0,out_ch:0-3
        filter_trans[1] = (half4)(filter[0].y, filter[1].y, filter[2].y,
                                  filter[3].y);  // in_ch:1,out_ch:0-3
        filter_trans[2] = (half4)(filter[0].z, filter[1].z, filter[2].z,
                                  filter[3].z);  // in_ch:2,out_ch:0-3
        filter_trans[3] = (half4)(filter[0].w, filter[1].w, filter[2].w,
                                  filter[3].w);  // in_ch:3,out_ch:0-3

        input[0] =
            read_imageh(input_image, sampler, (int2)(in_w_val0, in_h_val));
        input[1] =
            read_imageh(input_image, sampler, (int2)(in_w_val1, in_h_val));
        input[2] =
            read_imageh(input_image, sampler, (int2)(in_w_val2, in_h_val));
        input[3] =
            read_imageh(input_image, sampler, (int2)(in_w_val3, in_h_val));
        input[4] =
            read_imageh(input_image, sampler, (int2)(in_w_val4, in_h_val));

        output[0] = mad(input[0].x, filter_trans[0], output[0]);
        output[1] = mad(input[1].x, filter_trans[0], output[1]);
        output[2] = mad(input[2].x, filter_trans[0], output[2]);
        output[3] = mad(input[3].x, filter_trans[0], output[3]);
        output[4] = mad(input[4].x, filter_trans[0], output[4]);

        // Lanes y/z/w are skipped when they lie beyond in_ch.
        if (ch_surplus < 3) {
          output[0] = mad(input[0].y, filter_trans[1], output[0]);
          output[1] = mad(input[1].y, filter_trans[1], output[1]);
          output[2] = mad(input[2].y, filter_trans[1], output[2]);
          output[3] = mad(input[3].y, filter_trans[1], output[3]);
          output[4] = mad(input[4].y, filter_trans[1], output[4]);
        }
        if (ch_surplus < 2) {
          output[0] = mad(input[0].z, filter_trans[2], output[0]);
          output[1] = mad(input[1].z, filter_trans[2], output[1]);
          output[2] = mad(input[2].z, filter_trans[2], output[2]);
          output[3] = mad(input[3].z, filter_trans[2], output[3]);
          output[4] = mad(input[4].z, filter_trans[2], output[4]);
        }
        if (ch_surplus < 1) {
          output[0] = mad(input[0].w, filter_trans[3], output[0]);
          output[1] = mad(input[1].w, filter_trans[3], output[1]);
          output[2] = mad(input[2].w, filter_trans[3], output[2]);
          output[3] = mad(input[3].w, filter_trans[3], output[3]);
          output[4] = mad(input[4].w, filter_trans[3], output[4]);
        }
      }
    }
  }

#ifdef BATCH_NORM
  half4 scale = read_imageh(new_scale, sampler, (int2)(item_ch_id, 0));
  half4 biase = read_imageh(new_biase, sampler, (int2)(item_ch_id, 0));
  output[0] = mad(scale, output[0], biase);
  if (out_w_id1 < out_w) {
    output[1] = mad(scale, output[1], biase);
  }
  if (out_w_id2 < out_w) {
    output[2] = mad(scale, output[2], biase);
  }
  if (out_w_id3 < out_w) {
    output[3] = mad(scale, output[3], biase);
  }
  if (out_w_id4 < out_w) {
    output[4] = mad(scale, output[4], biase);
  }
#endif

#ifdef RELU
  output[0] = activation(output[0]);
  output[1] = activation(output[1]);
  output[2] = activation(output[2]);
  output[3] = activation(output[3]);
  output[4] = activation(output[4]);
#endif

  // Column 0 is always written; the remaining columns only when in range.
  write_imageh(output_image, (int2)(out_w_base_id + out_w_id0, item_h_id),
               output[0]);
  if (out_w_id1 < out_w) {
    write_imageh(output_image, (int2)(out_w_base_id + out_w_id1, item_h_id),
                 output[1]);
  }
  if (out_w_id2 < out_w) {
    write_imageh(output_image, (int2)(out_w_base_id + out_w_id2, item_h_id),
                 output[2]);
  }
  if (out_w_id3 < out_w) {
    write_imageh(output_image, (int2)(out_w_base_id + out_w_id3, item_h_id),
                 output[3]);
  }
  if (out_w_id4 < out_w) {
    write_imageh(output_image, (int2)(out_w_base_id + out_w_id4, item_h_id),
                 output[4]);
  }
}
/*
 * conv_5x5: 5x5 convolution on half-precision 2D images.
 *
 * One work-item produces one output texel at
 * (out_c * global_size_dim1 + out_w, out_nh), where out_c / out_w / out_nh
 * are global ids 0 / 1 / 2. Work-items outside the global_size_dim* bounds
 * return immediately.
 *
 * For every input block i in [0, input_c) and every tap (j, k) of the 5x5
 * window, the input texel is dotted with the filter texels of four output
 * channels (4 * out_c + 0..3); each dot product accumulates into one lane of
 * the result. Taps that fall outside [0, input_width) x [0, input_height)
 * contribute zero.
 *
 * Optional stages are selected at build time:
 *   BIASE_CH   - accumulator starts from bias texel (out_c, 0)
 *   BIASE_ELE  - accumulator starts from the bias texel at the output position
 *   BATCH_NORM - result = result * new_scale + new_biase, read at (out_c, 0)
 *   RELU       - result is passed through activation()
 *
 * `output_width` and `output_height` are accepted but not read.
 *
 * NOTE(review): read_imageh is called with integer coordinates while the
 * sampler requests CLK_NORMALIZED_COORDS_TRUE; the OpenCL C specification
 * appears to define integer-coordinate reads only for non-normalized
 * samplers - confirm on the target drivers before changing.
 */
__kernel void conv_5x5(
    __private const int global_size_dim0, __private const int global_size_dim1,
    __private const int global_size_dim2, __read_only image2d_t input_image,
    __read_only image2d_t filter_image,
#if defined(BIASE_CH) || defined(BIASE_ELE)
    __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
    __write_only image2d_t output_image, __private const int stride,
    __private const int offset, __private const int input_c,
    __private const int dilation,
    __private const int input_width,  /* of one block */
    __private const int input_height, /* of one block */
    __private const int output_width, __private const int output_height) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);

  if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
      out_nh >= global_size_dim2) {
    return;
  }

  // Filter rows of the four output channels produced by this work-item.
  // (An explicit `int` is required: OpenCL C has no implicit-int rule.)
  const int filter_n0 = 4 * out_c + 0;
  const int filter_n1 = 4 * out_c + 1;
  const int filter_n2 = 4 * out_c + 2;
  const int filter_n3 = 4 * out_c + 3;

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  // Centre of the 5x5 window inside one input block.
  int2 in_pos_in_one_block;
  in_pos_in_one_block.x = out_w * stride + offset;
  in_pos_in_one_block.y = out_nh * stride + offset;

#ifdef BIASE_CH
  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
  half4 output = read_imageh(bias, sampler, output_pos);
#else
  half4 output = 0.0f;
#endif

  half4 input;
  half4 filter[4];

  for (int i = 0; i < input_c; ++i) {
    // Block i starts at x = i * input_width in the input image.
    int2 pos_in =
        (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);

    for (int j = 0; j < 5; j++) {
      for (int k = 0; k < 5; k++) {
        const int tap_x = in_pos_in_one_block.x + (j - 2) * dilation;
        const int tap_y = in_pos_in_one_block.y + (k - 2) * dilation;

        // The comparison yields 0 or 1; shifting it to bit 15 sets the MSB of
        // each ushort lane, which is what select() tests to pick zero for
        // taps outside the block.
        input = select(
            read_imageh(input_image, sampler,
                        (int2)(pos_in.x + (j - 2) * dilation,
                               pos_in.y + (k - 2) * dilation)),
            (half4)(0.0f),
            (ushort4)((tap_x < 0 || tap_y < 0 || tap_x >= input_width ||
                       tap_y >= input_height)
                      << 15));

        // Filter image: x = input block * 5 + column tap,
        //               y = output channel * 5 + row tap.
        const int filter_x = i * 5 + j;

        filter[0] = read_imageh(filter_image, sampler,
                                (int2)(filter_x, filter_n0 * 5 + k));
        filter[1] = read_imageh(filter_image, sampler,
                                (int2)(filter_x, filter_n1 * 5 + k));
        filter[2] = read_imageh(filter_image, sampler,
                                (int2)(filter_x, filter_n2 * 5 + k));
        filter[3] = read_imageh(filter_image, sampler,
                                (int2)(filter_x, filter_n3 * 5 + k));

        output.x += dot(input, filter[0]);
        output.y += dot(input, filter[1]);
        output.z += dot(input, filter[2]);
        output.w += dot(input, filter[3]);
      }
    }
  }

#ifdef BATCH_NORM
  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
           read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif

#ifdef RELU
  output = activation(output);
#endif

  write_imageh(output_image, output_pos, output);
}
__kernel void convBNAdd_3x3(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
__kernel void convBNAdd_3x3(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height) {
half4 output = (half4)0.0f;
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
half4 input[9]
;
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh)
;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
input[0] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
if (out_c >= global_size_dim0 || out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
input[1] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
input[2] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
input[3] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
| CLK_FILTER_NEAREST;
input[4] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
input[5] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
half4 output = (half4)0.0f;
input[6] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
half4 input[9];
input[7] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
for (int i = 0; i < input_c; ++i) {
int2 pos_in =
(int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
input[0] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[1] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[2] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y - dilation >= input_height)
<< 15));
input[3] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[4] = select(
read_imageh(input_image, sampler, (int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[5] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
input[6] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[7] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[8] =
select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 ||
in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y + dilation >= input_height)
<< 15));
input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
/*
for (int j = 0; j < 9; ++j) {
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
/*
for (int j = 0; j < 9; ++j) {
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
*/
int j = 0;
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 1;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 2;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 3;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 4;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 5;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 6;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 7;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 8;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
}
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
*/
int j = 0;
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 1;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 2;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 3;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 4;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 5;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 6;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 7;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 8;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef BIASE_CH
output += read_imageh(bias, sampler, (int2)(out_c, 0));
output += read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
output += read_imageh(bias, sampler, output_pos);
output += read_imageh(bias, sampler, output_pos);
#endif
#ifdef RELU
output = activation(output);
output = activation(output);
#endif
write_imageh(output_image, output_pos, output);
write_imageh(output_image, output_pos, output);
}
__kernel void convBNAdd_1x1(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
__kernel void convBNAdd_1x1(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
| CLK_FILTER_NEAREST;
const uint kernelHXW = 1;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh);
int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
int2 in_pos_in_one_block =
ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
half4 output = 0.0f;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
half4 input = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
/*
output.x = dot(input, weight0);
output.y = dot(input, weight1);
output.z = dot(input, weight2);
output.w = dot(input, weight3);
*/
for (int i = 0; i < input_c; ++i) {
int2 pos_in =
(int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
half4 input = read_imageh(input_image, sampler, pos_in);
output = mad(input.x, weight0, output);
output = mad(input.y, weight1, output);
output = mad(input.z, weight2, output);
output = mad(input.w, weight3, output);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
/*
output.x = dot(input, weight0);
output.y = dot(input, weight1);
output.z = dot(input, weight2);
output.w = dot(input, weight3);
*/
}
output = mad(input.x, weight0, output);
output = mad(input.y, weight1, output);
output = mad(input.z, weight2, output);
output = mad(input.w, weight3, output);
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef BIASE_CH
output += read_imageh(bias, sampler, (int2)(out_c, 0));
output += read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
output += read_imageh(bias, sampler, output_pos);
output += read_imageh(bias, sampler, output_pos);
#endif
#ifdef RELU
...
...
@@ -2398,24 +2569,22 @@ __kernel void convBNAdd_1x1(__private const int global_size_dim0,
}
__kernel void convBNAdd_1x1_spl(
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter,
__private const int global_size_dim0, __private const int global_size_dim1,
__private const int global_size_dim2, __read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias,
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int old_w
) {
__read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image, __private const int stride,
__private const int offset, __private const int input_c,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height,
__private const int old_w) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
...
...
@@ -2426,33 +2595,32 @@ __kernel void convBNAdd_1x1_spl(
int out_w2 = out_w + global_size_dim1 * 2;
int out_w3 = out_w + global_size_dim1 * 3;
int outpos_main = mul24(out_c
, old_w);
int outpos_main = mul24(out_c, old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);
int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh);
int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
|
CLK_FILTER_NEAREST
;
CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP
| CLK_FILTER_NEAREST;
int2 stride_xy = (int2)(stride, stride);
int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh);
int2 in_pos_in_one_block0 =
ouput_pos_in_one_block0
*
stride_xy
+
(
int2
)(
offset,
offset
)
;
ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh);
int2 in_pos_in_one_block1 =
ouput_pos_in_one_block1
*
stride_xy
+
(
int2
)(
offset,
offset
)
;
ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh);
int2 in_pos_in_one_block2 =
ouput_pos_in_one_block2
*
stride_xy
+
(
int2
)(
offset,
offset
)
;
ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset);
int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh);
int2 in_pos_in_one_block3 =
ouput_pos_in_one_block3
*
stride_xy
+
(
int2
)(
offset,
offset
)
;
ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset);
half4 output0 = 0.0f;
half4 output1 = 0.0f;
...
...
@@ -2461,7 +2629,8 @@ __kernel void convBNAdd_1x1_spl(
for (int i = 0; i < input_c; ++i) {
// ------------0---------------
int2
pos_in
=
(
int2
)(
i
*
input_width
+
in_pos_in_one_block0.x,
in_pos_in_one_block0.y
)
;
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x,
in_pos_in_one_block0.y);
half4 input0 = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
...
...
@@ -2475,7 +2644,8 @@ __kernel void convBNAdd_1x1_spl(
output0 = mad(input0.w, weight3, output0);
// -------------1--------------
pos_in
=
(
int2
)(
i
*
input_width
+
in_pos_in_one_block1.x,
in_pos_in_one_block1.y
)
;
pos_in = (int2)(i * input_width + in_pos_in_one_block1.x,
in_pos_in_one_block1.y);
half4 input1 = read_imageh(input_image, sampler, pos_in);
//
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
...
...
@@ -2490,7 +2660,8 @@ __kernel void convBNAdd_1x1_spl(
output1 = mad(input1.w, weight3, output1);
// -------------2--------------
pos_in
=
(
int2
)(
i
*
input_width
+
in_pos_in_one_block2.x,
in_pos_in_one_block2.y
)
;
pos_in = (int2)(i * input_width + in_pos_in_one_block2.x,
in_pos_in_one_block2.y);
half4 input2 = read_imageh(input_image, sampler, pos_in);
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
...
...
@@ -2505,7 +2676,8 @@ __kernel void convBNAdd_1x1_spl(
output2 = mad(input2.w, weight3, output2);
// -------------3--------------
pos_in
=
(
int2
)(
i
*
input_width
+
in_pos_in_one_block3.x,
in_pos_in_one_block3.y
)
;
pos_in = (int2)(i * input_width + in_pos_in_one_block3.x,
in_pos_in_one_block3.y);
half4 input3 = read_imageh(input_image, sampler, pos_in);
// half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 +
...
...
@@ -2521,29 +2693,29 @@ __kernel void convBNAdd_1x1_spl(
}
#ifdef BATCH_NORM
output0
=
output0
*
read_imageh
(
new_scale,
sampler,
(
int2
)(
out_c,
0
))
+
read_imageh
(
new_biase,
sampler,
(
int2
)(
out_c,
0
))
;
output0 = output0 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output1
=
output1
*
read_imageh
(
new_scale,
sampler,
(
int2
)(
out_c,
0
))
+
read_imageh
(
new_biase,
sampler,
(
int2
)(
out_c,
0
))
;
output1 = output1 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output2
=
output2
*
read_imageh
(
new_scale,
sampler,
(
int2
)(
out_c,
0
))
+
read_imageh
(
new_biase,
sampler,
(
int2
)(
out_c,
0
))
;
output2 = output2 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
output3
=
output3
*
read_imageh
(
new_scale,
sampler,
(
int2
)(
out_c,
0
))
+
read_imageh
(
new_biase,
sampler,
(
int2
)(
out_c,
0
))
;
output3 = output3 * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef BIASE_CH
output0
+=
read_imageh
(
bias,
sampler,
(
int2
)(
out_c,
0
))
;
output1
+=
read_imageh
(
bias,
sampler,
(
int2
)(
out_c,
0
))
;
output2
+=
read_imageh
(
bias,
sampler,
(
int2
)(
out_c,
0
))
;
output3
+=
read_imageh
(
bias,
sampler,
(
int2
)(
out_c,
0
))
;
output0 += read_imageh(bias, sampler, (int2)(out_c, 0));
output1 += read_imageh(bias, sampler, (int2)(out_c, 0));
output2 += read_imageh(bias, sampler, (int2)(out_c, 0));
output3 += read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
output0
+=
read_imageh
(
bias,
sampler,
output_pos0
)
;
output1
+=
read_imageh
(
bias,
sampler,
output_pos1
)
;
output2
+=
read_imageh
(
bias,
sampler,
output_pos2
)
;
output3
+=
read_imageh
(
bias,
sampler,
output_pos3
)
;
output0 += read_imageh(bias, sampler, output_pos0);
output1 += read_imageh(bias, sampler, output_pos1);
output2 += read_imageh(bias, sampler, output_pos2);
output3 += read_imageh(bias, sampler, output_pos3);
#endif
#ifdef RELU
...
...
@@ -2557,22 +2729,108 @@ __kernel void convBNAdd_1x1_spl(
write_imageh(output_image, output_pos0, output0);
}
if
(
out_w1
<
old_w
)
{
if (out_w1 < old_w)
{
write_imageh(output_image, output_pos1, output1);
}
if
(
out_w2
<
old_w
)
{
if (out_w2 < old_w)
{
write_imageh(output_image, output_pos2, output2);
}
if
(
out_w3
<
old_w
)
{
if (out_w3 < old_w)
{
write_imageh(output_image, output_pos3, output3);
}
}
/*
 * depth_conv: per-lane (element-wise) convolution with an arbitrary
 * filter_width x filter_height window.
 *
 * Each work-item produces one half4 texel of output_image at
 *   output_pos = (out_c * global_size_dim1 + out_w, out_nh)
 * where (out_c, out_w, out_nh) are the three global ids.
 *
 * For every filter tap (fx, fy) the kernel reads one input texel and one
 * filter texel, multiplies them lane by lane and adds the product to the
 * accumulator. There is no reduction across the four half4 lanes.
 *
 * Compile-time switches:
 *   BIASE_CH   - accumulator starts from bias at (out_c, 0).
 *   BIASE_ELE  - accumulator starts from bias at output_pos.
 *   BATCH_NORM - result is scaled by new_scale and shifted by new_biase,
 *                both read at (out_c, 0). Because the bias seeds the
 *                accumulator, it is scaled as well.
 *   RELU       - result is passed through activation(), which is defined
 *                elsewhere in this file.
 *
 * Parameters read by the body: global_size_dim1, stride, offset,
 * input_width, input_height, output_height, filter_width, filter_height.
 * global_size_dim0, global_size_dim2, input_c, dilation and output_width are
 * accepted but not read here.
 *
 * NOTE(review): the sampler uses CLK_NORMALIZED_COORDS_TRUE together with
 * integer coordinates, as the rest of this file does. The OpenCL C spec
 * asks for CLK_NORMALIZED_COORDS_FALSE with integer-coordinate image reads;
 * confirm this is intentional for the target drivers.
 */
__kernel void depth_conv(
    __private const int global_size_dim0, __private const int global_size_dim1,
    __private const int global_size_dim2, __read_only image2d_t input,
    __read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE)
    __read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
    __read_only image2d_t new_scale, __read_only image2d_t new_biase,
#endif
    __write_only image2d_t output_image, __private const int stride,
    __private const int offset, __private const int input_c,
    __private const int dilation,
    __private const int input_width,  /* of one block */
    __private const int input_height, /* of one block */
    __private const int output_width, __private const int output_height,
    __private const int filter_width, __private const int filter_height) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);

  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  // out_nh is split into a batch index and a row inside that batch.
  const int batch_index = out_nh / output_height;
  const int out_nh_in_one_batch = out_nh % output_height;

  // Input position, inside one block, that corresponds to this output texel.
  int2 stride_xy = (int2)(stride, stride);
  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
  int2 in_pos_in_one_block =
      ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);

  // The accumulator is seeded with the bias when one is compiled in.
#ifdef BIASE_CH
  half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#elif defined(BIASE_ELE)
  half4 output = read_imageh(bias, sampler, output_pos);
#else
  half4 output = 0.0f;
#endif

  // Origins of the input block and the filter block used by this work-item.
  int2 pos_in_input_block =
      (int2)(out_c * input_width, batch_index * input_height);
  int2 pos_in_filter_block =
      (int2)(out_c * filter_width, batch_index * filter_height);

  int filter_x = pos_in_filter_block.x;
  int filter_y = pos_in_filter_block.y;
  int input_x_base = pos_in_input_block.x + in_pos_in_one_block.x;
  int input_y_base = pos_in_input_block.y + in_pos_in_one_block.y;

  // Half-extent of the window; taps are offset so the window is centred.
  int2 align = {filter_width / 2, filter_height / 2};

  /* if (output_pos.x == 0 && output_pos.y == 0){
       printf("align.x=%d align.y=%d \n ",align.x,align.y);
       printf("stride=%d \n ",stride);
   }*/

  for (int fy = 0; fy < filter_height; ++fy) {
    for (int fx = 0; fx < filter_width; ++fx) {
      int x_off = fx - align.x;
      int y_off = fy - align.y;

      /* if (output_pos.x == 0 && output_pos.y == 0){
           printf("fx=%d fy=%d \n ",fx,fy);
           printf("x_off=%d y_off=%d \n ",x_off,y_off);
       }*/

      // Taps that fall outside the current block are replaced by zero.
      // The condition (0 or 1) shifted left by 15 sets the most significant
      // bit of each ushort mask lane, which is the bit select() tests.
      half4 in = select(
          read_imageh(input, sampler,
                      (int2)(input_x_base + x_off, input_y_base + y_off)),
          (half4)(0.0f),
          (ushort4)((in_pos_in_one_block.x + x_off < 0 ||
                     in_pos_in_one_block.y + y_off < 0 ||
                     in_pos_in_one_block.x + x_off >= input_width ||
                     in_pos_in_one_block.y + y_off >= input_height)
                    << 15));

      half4 f = read_imageh(filter, sampler,
                            (int2)(filter_x + fx, filter_y + fy));

      output += in * f;

      /*if (output_pos.x ==111 && output_pos.y == 0) {
          printf("in={ %f , %f , %f , %f } \n " , convert_float(in.x) ,
                 convert_float(in.y) , convert_float(in.z) ,
                 convert_float(in.w));
          printf("filter={ %f , %f , %f , %f } \n " , convert_float(f.x) ,
                 convert_float(f.y) , convert_float(f.z) ,
                 convert_float(f.w));
          printf("output={ %f , %f , %f , %f } \n " ,
                 convert_float(output.x) , convert_float(output.y) ,
                 convert_float(output.z) , convert_float(output.w));
      }*/
    }
  }

#ifdef BATCH_NORM
  output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) +
           read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif

#ifdef RELU
  output = activation(output);
#endif

  write_imageh(output_image, output_pos, output);
}
\ No newline at end of file
mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl
浏览文件 @
6554854a
...
...
@@ -13,33 +13,101 @@ See the License for the specific language governing permissions and
limitations
under
the
License.
*/
#
pragma
OPENCL
EXTENSION
cl_khr_fp16
:
enable
// Element-wise product of two images.
//
// One work-item handles one pixel: global id 0 is the x coordinate and
// global id 1 is the y coordinate. The half4 read from `input` is multiplied
// lane by lane with the half4 read from `bias` at the same coordinates, and
// the product is written to `outputImage` at those coordinates.
__kernel void elementwise_mul(__global image2d_t input,
                              __global image2d_t bias,
                              __write_only image2d_t outputImage) {
  // read_imageh is called with integer coordinates below. The OpenCL C
  // specification only defines the result of integer-coordinate reads for
  // samplers that use unnormalized coordinates, so the sampler must not be
  // declared with CLK_NORMALIZED_COORDS_TRUE.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int2 coords = (int2)(get_global_id(0), get_global_id(1));

  const half4 in = read_imageh(input, sampler, coords);
  const half4 biase = read_imageh(bias, sampler, coords);

  write_imageh(outputImage, coords, in * biase);
}
__kernel
void
channel_mul
(
__global
image2d_t
input,
__global
image2d_t
bias,__write_only
image2d_t
outputImage,
int
w
)
{
// Element-wise product of two images.
//
// One work-item handles one pixel: global id 0 is the x coordinate and
// global id 1 is the y coordinate. The half4 read from `input` is multiplied
// lane by lane with the half4 read from `bias` at the same coordinates, and
// the product is written to `outputImage` at those coordinates.
__kernel void elementwise_mul(__global image2d_t input,
                              __global image2d_t bias,
                              __write_only image2d_t outputImage) {
  const int x = get_global_id(0);
  const int y = get_global_id(1);

  // Declared exactly once (a second declaration of `sampler` in this scope is
  // a redeclaration error). Unnormalized coordinates are required because
  // read_imageh is called with integer coordinates; the OpenCL C
  // specification leaves integer-coordinate reads through a
  // normalized-coordinate sampler undefined.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int2 coords = (int2)(x, y);

  const half4 in = read_imageh(input, sampler, coords);
  const half4 biase = read_imageh(bias, sampler, coords);
  const half4 output = in * biase;

  write_imageh(outputImage, coords, output);
}
// Multiplies every pixel of `input` by a pixel taken from row 0 of `bias`.
//
// One work-item handles one pixel: global id 0 is the x coordinate and
// global id 1 is the y coordinate. The bias pixel is read at column `x / w`
// (integer division) of row 0, so all input columns that share the same
// quotient use the same bias pixel.
//
// w: divisor that maps an input image column to a bias image column
//    (presumably the tensor width supplied by the host; confirm against the
//    caller).
__kernel void channel_mul(__global image2d_t input,
                          __global image2d_t bias,
                          __write_only image2d_t outputImage,
                          int w) {
  const int x = get_global_id(0);
  const int y = get_global_id(1);

  // Unnormalized coordinates are required because read_imageh is called with
  // integer coordinates; the OpenCL C specification leaves integer-coordinate
  // reads through a normalized-coordinate sampler undefined.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int2 coords = (int2)(x, y);
  const int2 coords_bias = (int2)(x / w, 0);

  const half4 in = read_imageh(input, sampler, coords);
  const half4 biase = read_imageh(bias, sampler, coords_bias);
  const half4 output = in * biase;

  // The result is stored exactly once; repeating the same store adds nothing.
  write_imageh(outputImage, coords, output);
}
// Multiplies every pixel of `input` by a half4 assembled from four
// neighbouring pixels on row 0 of `bias`.
//
// Example given by the original author: a Y tensor with dims 1 1 1 72 shows
// up at run time as 72 pixels of the form [value, 0, 0, 0]. Only the .x lane
// of each bias pixel is used here.
//
// One work-item handles one pixel: global id 0 is the x coordinate and
// global id 1 is the y coordinate. For the pixel at column x, the four bias
// pixels at columns (x / w) * 4 + {0, 1, 2, 3} of row 0 are read, their .x
// lanes are packed into one half4, and that half4 is multiplied lane by lane
// with the input pixel.
//
// w: divisor that maps an input image column to a group of four bias columns
//    (the host passes the last dimension of the input tensor).
__kernel void channel_mul_d2(__global image2d_t input,
                             __global image2d_t bias,
                             __write_only image2d_t outputImage,
                             int w) {
  const int x = get_global_id(0);
  const int y = get_global_id(1);

  // Unnormalized coordinates are required because read_imageh is called with
  // integer coordinates; the OpenCL C specification leaves integer-coordinate
  // reads through a normalized-coordinate sampler undefined.
  const sampler_t sampler =
      CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

  const int2 coords = (int2)(x, y);

  // First of the four consecutive bias columns that belong to this pixel.
  const int bias_x = x / w * 4;

  const half4 biase0 = read_imageh(bias, sampler, (int2)(bias_x, 0));
  const half4 biase1 = read_imageh(bias, sampler, (int2)(bias_x + 1, 0));
  const half4 biase2 = read_imageh(bias, sampler, (int2)(bias_x + 2, 0));
  const half4 biase3 = read_imageh(bias, sampler, (int2)(bias_x + 3, 0));

  // Pack the .x lane of each bias pixel into a single multiplier.
  const half4 biase = (half4)(biase0.x, biase1.x, biase2.x, biase3.x);

  const half4 in = read_imageh(input, sampler, coords);
  const half4 output = mad(in, biase, (half4)(0.0f));

  write_imageh(outputImage, coords, output);
}
\ No newline at end of file
mobile/src/operators/kernel/cl/conv_add_bn_relu_kernel.cpp
浏览文件 @
6554854a
...
...
@@ -174,6 +174,16 @@ bool ConvAddBNReluKernel<GPU_CL, float>::Init(
build_options
);
}
}
else
if
(
param
->
Filter
()
->
dims
()[
1
]
==
1
&&
param
->
Input
()
->
dims
()[
1
]
==
param
->
Output
()
->
dims
()[
1
]
&&
param
->
Filter
()
->
dims
()[
2
]
!=
3
)
{
param
->
Filter
()
->
InitDWImage
(
cl_helper_
.
CLContext
(),
cl_helper_
.
CLCommandQueue
());
// other depthwise not with filter 3x3
DLOG
<<
"depth_conv basic "
;
param
->
ExecMode
()
=
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISEBASIC_FLOAT
;
this
->
cl_helper_
.
AddKernel
(
"depth_conv"
,
conv_kernel_file
,
build_options
);
}
else
if
(
param
->
Filter
()
->
dims
()[
2
]
==
3
&&
param
->
Filter
()
->
dims
()[
3
]
==
3
)
{
// if (param->Strides()[0] == param->Strides()[1] &&
...
...
@@ -214,6 +224,7 @@ void ConvAddBNReluKernel<GPU_CL, float>::Compute(
case
ConvParam
<
GPU_CL
>::
EXEC_SLIDINGWINDOW1x1_FLOAT
:
case
ConvParam
<
GPU_CL
>::
EXEC_SLIDINGWINDOW3x3_FLOAT
:
case
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISE3x3_FLOAT
:
case
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISEBASIC_FLOAT
:
ConvAddBnRelu
(
&
this
->
cl_helper_
,
param
,
true
,
param
.
Bias
(),
param
.
NewScale
(),
param
.
NewBias
());
break
;
...
...
mobile/src/operators/kernel/cl/conv_add_kernel.cpp
浏览文件 @
6554854a
...
...
@@ -71,6 +71,14 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
build_options
);
}
}
else
if
(
param
->
Filter
()
->
dims
()[
1
]
==
1
&&
param
->
Input
()
->
dims
()[
1
]
==
param
->
Output
()
->
dims
()[
1
]
&&
param
->
Filter
()
->
dims
()[
2
]
!=
3
)
{
param
->
Filter
()
->
InitDWImage
(
cl_helper_
.
CLContext
(),
cl_helper_
.
CLCommandQueue
());
param
->
ExecMode
()
=
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISEBASIC_FLOAT
;
this
->
cl_helper_
.
AddKernel
(
"depth_conv"
,
conv_kernel_file
,
build_options
);
}
else
if
(
param
->
Filter
()
->
dims
()[
2
]
==
3
&&
param
->
Filter
()
->
dims
()[
3
]
==
3
)
{
// if (param->Strides()[0] == param->Strides()[1] &&
...
...
@@ -124,6 +132,7 @@ void ConvAddKernel<GPU_CL, float>::Compute(
case
ConvParam
<
GPU_CL
>::
EXEC_SLIDINGWINDOW1x1_FLOAT
:
case
ConvParam
<
GPU_CL
>::
EXEC_SLIDINGWINDOW5x5_FLOAT
:
case
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISE3x3_FLOAT
:
case
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISEBASIC_FLOAT
:
ConvAddBnRelu
(
&
this
->
cl_helper_
,
param
,
false
,
param
.
Bias
());
break
;
case
ConvParam
<
GPU_CL
>::
EXEC_SLIDINGWINDOW7x7_FLOAT
:
...
...
mobile/src/operators/kernel/cl/conv_add_relu_kernel.cpp
浏览文件 @
6554854a
...
...
@@ -72,6 +72,14 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
build_options
);
}
}
else
if
(
param
->
Filter
()
->
dims
()[
1
]
==
1
&&
param
->
Input
()
->
dims
()[
1
]
==
param
->
Output
()
->
dims
()[
1
]
&&
param
->
Filter
()
->
dims
()[
2
]
!=
3
)
{
param
->
Filter
()
->
InitDWImage
(
cl_helper_
.
CLContext
(),
cl_helper_
.
CLCommandQueue
());
DLOG
<<
"init depwise conv basic"
;
param
->
ExecMode
()
=
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISEBASIC_FLOAT
;
this
->
cl_helper_
.
AddKernel
(
"depth_conv"
,
conv_kernel_file
,
build_options
);
}
else
if
(
param
->
Filter
()
->
dims
()[
2
]
==
3
&&
param
->
Filter
()
->
dims
()[
3
]
==
3
)
{
// if (param->Strides()[0] == param->Strides()[1] &&
...
...
@@ -130,6 +138,7 @@ void ConvAddReluKernel<GPU_CL, float>::Compute(
case
ConvParam
<
GPU_CL
>::
EXEC_SLIDINGWINDOW5x5_FLOAT
:
case
ConvParam
<
GPU_CL
>::
EXEC_SLIDINGWINDOW7x7_FLOAT
:
case
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISE3x3_FLOAT
:
case
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISEBASIC_FLOAT
:
ConvAddBnRelu
(
&
this
->
cl_helper_
,
param
,
true
,
param
.
Bias
());
break
;
case
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISE3x3S1_FLOAT
:
...
...
mobile/src/operators/kernel/cl/conv_bn_relu_kernel.cpp
浏览文件 @
6554854a
...
...
@@ -129,6 +129,14 @@ bool ConvBNReluKernel<GPU_CL, float>::Init(
build_options
);
}
}
else
if
(
param
->
Filter
()
->
dims
()[
1
]
==
1
&&
param
->
Input
()
->
dims
()[
1
]
==
param
->
Output
()
->
dims
()[
1
]
&&
param
->
Filter
()
->
dims
()[
2
]
!=
3
)
{
param
->
Filter
()
->
InitDWImage
(
cl_helper_
.
CLContext
(),
cl_helper_
.
CLCommandQueue
());
param
->
ExecMode
()
=
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISEBASIC_FLOAT
;
this
->
cl_helper_
.
AddKernel
(
"depth_conv"
,
conv_kernel_file
,
build_options
);
}
else
if
(
param
->
Filter
()
->
dims
()[
2
]
==
3
&&
param
->
Filter
()
->
dims
()[
3
]
==
3
)
{
// if (param->Strides()[0] == param->Strides()[1] &&
...
...
@@ -168,6 +176,7 @@ void ConvBNReluKernel<GPU_CL, float>::Compute(
case
ConvParam
<
GPU_CL
>::
EXEC_SLIDINGWINDOW1x1_FLOAT
:
case
ConvParam
<
GPU_CL
>::
EXEC_SLIDINGWINDOW3x3_FLOAT
:
case
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISE3x3_FLOAT
:
case
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISEBASIC_FLOAT
:
ConvAddBnRelu
(
&
this
->
cl_helper_
,
param
,
true
,
nullptr
,
param
.
NewScale
(),
param
.
NewBias
());
break
;
...
...
mobile/src/operators/kernel/cl/conv_kernel.cpp
浏览文件 @
6554854a
...
...
@@ -66,6 +66,14 @@ bool ConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
}
DLOG
<<
"depth_conv 3x3"
;
}
else
if
(
param
->
Filter
()
->
dims
()[
1
]
==
1
&&
param
->
Input
()
->
dims
()[
1
]
==
param
->
Output
()
->
dims
()[
1
]
&&
param
->
Filter
()
->
dims
()[
2
]
!=
3
)
{
param
->
Filter
()
->
InitDWImage
(
cl_helper_
.
CLContext
(),
cl_helper_
.
CLCommandQueue
());
param
->
ExecMode
()
=
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISEBASIC_FLOAT
;
this
->
cl_helper_
.
AddKernel
(
"depth_conv"
,
conv_kernel_file
);
}
else
if
(
param
->
Filter
()
->
dims
()[
2
]
==
3
&&
param
->
Filter
()
->
dims
()[
3
]
==
3
)
{
// if (param->Strides()[0] == param->Strides()[1] &&
...
...
@@ -115,6 +123,7 @@ void ConvKernel<GPU_CL, float>::Compute(const ConvParam<GPU_CL> ¶m) {
case
ConvParam
<
GPU_CL
>::
EXEC_SLIDINGWINDOW3x3_FLOAT
:
case
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISE3x3_FLOAT
:
case
ConvParam
<
GPU_CL
>::
EXEC_SLIDINGWINDOW7x7_FLOAT
:
case
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISEBASIC_FLOAT
:
ConvAddBnRelu
(
&
this
->
cl_helper_
,
param
);
break
;
case
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISE3x3S1_FLOAT
:
...
...
mobile/src/operators/kernel/cl/conv_relu_kernel.cpp
浏览文件 @
6554854a
...
...
@@ -72,6 +72,14 @@ bool ConvReluKernel<GPU_CL, float>::Init(FusionConvReluParam<GPU_CL> *param) {
DLOG
<<
"depth_conv 3x3"
;
}
else
if
(
param
->
Filter
()
->
dims
()[
1
]
==
1
&&
param
->
Input
()
->
dims
()[
1
]
==
param
->
Output
()
->
dims
()[
1
]
&&
param
->
Filter
()
->
dims
()[
2
]
!=
3
)
{
param
->
Filter
()
->
InitDWImage
(
cl_helper_
.
CLContext
(),
cl_helper_
.
CLCommandQueue
());
param
->
ExecMode
()
=
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISEBASIC_FLOAT
;
this
->
cl_helper_
.
AddKernel
(
"depth_conv"
,
conv_kernel_file
,
build_options
);
}
else
if
(
param
->
Filter
()
->
dims
()[
2
]
==
3
&&
param
->
Filter
()
->
dims
()[
3
]
==
3
)
{
// if (param->Strides()[0] == param->Strides()[1] &&
...
...
@@ -120,6 +128,7 @@ void ConvReluKernel<GPU_CL, float>::Compute(
case
ConvParam
<
GPU_CL
>::
EXEC_SLIDINGWINDOW1x1_FLOAT
:
case
ConvParam
<
GPU_CL
>::
EXEC_SLIDINGWINDOW3x3_FLOAT
:
case
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISE3x3_FLOAT
:
case
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISEBASIC_FLOAT
:
ConvAddBnRelu
(
&
this
->
cl_helper_
,
param
,
true
);
break
;
case
ConvParam
<
GPU_CL
>::
EXEC_DEPTHWISE3x3S1_FLOAT
:
...
...
mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp
浏览文件 @
6554854a
...
...
@@ -15,6 +15,8 @@ limitations under the License. */
#ifdef ELEMENTWISEMUL_OP
#include "operators/kernel/elementwise_mul_kernel.h"
#include <framework/cl/cl_half.h>
#include <iostream>
#include "framework/cl/cl_image.h"
namespace
paddle_mobile
{
...
...
@@ -23,19 +25,24 @@ namespace operators {
/**
 * Registers the OpenCL kernel used for the element-wise multiply, chosen by
 * comparing the dims of InputY with the dims of InputX:
 *   - identical dims      -> "elementwise_mul"
 *   - InputY has 4 dims   -> nothing is registered
 *   - InputY has 1 dim    -> "channel_mul"
 *   - InputY has 2 dims   -> "channel_mul_d2"
 *   - anything else       -> PADDLE_MOBILE_ENFORCE(false, ...)
 *
 * @param param operator parameters providing InputX and InputY.
 * @return true on every path that returns.
 */
template <>
bool ElementwiseMulKernel<GPU_CL, float>::Init(
    ElementwiseMulParam<GPU_CL> *param) {
  DLOG << "-----init add-----";

  framework::CLImage *bias = reinterpret_cast<framework::CLImage *>(
      const_cast<framework::CLImage *>(param->InputY()));

  // Y has exactly the dims of X: plain per-pixel product.
  if (bias->dims() == param->InputX()->dims()) {
    DLOG << "init element wise mul";
    this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl");
    return true;
  }

  // NOTE(review): a 4-dim Y whose dims differ from X registers no kernel at
  // all; confirm that this is intended.
  if (bias->dims().size() == 4) {
    return true;
  }

  if (bias->dims().size() == 1) {
    DLOG << "init channel_mul";
    this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl");
    return true;
  }

  if (bias->dims().size() == 2) {
    // e.g. input  1 72 28 28
    //      filter 1 72
    DLOG << "init channel_mul_d2";
    this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl");
    return true;
  }

  DLOG << "error:bias dims is error";
  PADDLE_MOBILE_ENFORCE(false, "element mul not supported yet");
  return true;
}
template
<
>
void
ElementwiseMulKernel
<
GPU_CL
,
float
>::
Compute
(
const
ElementwiseMulParam
<
GPU_CL
>
&
param
)
{
...
...
@@ -64,8 +71,8 @@ void ElementwiseMulKernel<GPU_CL, float>::Compute(
clEnqueueNDRangeKernel
(
this
->
cl_helper_
.
CLCommandQueue
(),
kernel
,
2
,
NULL
,
global_work_size
,
NULL
,
0
,
NULL
,
NULL
);
CL_CHECK_ERRORS
(
status
);
}
else
if
(
bias
->
dims
().
size
()
==
4
)
{
DLOG
<<
"
zp7 444
"
;
}
else
if
(
bias
->
dims
().
size
()
==
1
)
{
DLOG
<<
"
channel mul
"
;
cl_mem
input_image
=
input
->
GetCLImage
();
cl_mem
bias_image
=
bias
->
GetCLImage
();
cl_mem
output_image
=
output
->
GetCLImage
();
...
...
@@ -84,14 +91,48 @@ void ElementwiseMulKernel<GPU_CL, float>::Compute(
CL_CHECK_ERRORS
(
status
);
auto
width
=
input
->
ImageWidth
();
auto
height
=
input
->
ImageHeight
();
DLOG
<<
"dede:"
<<
width
<<
","
<<
height
;
size_t
global_work_size
[
2
]
=
{
width
,
height
};
status
=
clEnqueueNDRangeKernel
(
this
->
cl_helper_
.
CLCommandQueue
(),
kernel
,
2
,
NULL
,
global_work_size
,
NULL
,
0
,
NULL
,
NULL
);
CL_CHECK_ERRORS
(
status
);
}
else
if
(
bias
->
dims
().
size
()
==
2
)
{
DLOG
<<
"channel mul d2"
;
// etc. input 1 72 28 28
// filter 1 72 --> 1 1 1 72
DLOG
<<
"input->ImageDims(): "
<<
input
->
ImageDims
();
DLOG
<<
"bias->ImageDims(): "
<<
bias
->
ImageDims
();
DLOG
<<
"out->ImageDims(): "
<<
output
->
ImageDims
();
DLOG
<<
"channel mul d2"
;
cl_mem
input_image
=
input
->
GetCLImage
();
cl_mem
bias_image
=
bias
->
GetCLImage
();
cl_mem
output_image
=
output
->
GetCLImage
();
int
tensor_w
=
input
->
dims
()[
input
->
dims
().
size
()
-
1
];
status
=
clSetKernelArg
(
kernel
,
0
,
sizeof
(
cl_mem
),
reinterpret_cast
<
void
*>
(
&
input_image
));
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
1
,
sizeof
(
cl_mem
),
reinterpret_cast
<
void
*>
(
&
bias_image
));
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
2
,
sizeof
(
cl_mem
),
reinterpret_cast
<
void
*>
(
&
output_image
));
CL_CHECK_ERRORS
(
status
);
status
=
clSetKernelArg
(
kernel
,
3
,
sizeof
(
cl_int
),
reinterpret_cast
<
void
*>
(
&
tensor_w
));
CL_CHECK_ERRORS
(
status
);
auto
width
=
input
->
ImageWidth
();
auto
height
=
input
->
ImageHeight
();
size_t
global_work_size
[
2
]
=
{
width
,
height
};
status
=
clEnqueueNDRangeKernel
(
this
->
cl_helper_
.
CLCommandQueue
(),
kernel
,
2
,
NULL
,
global_work_size
,
NULL
,
0
,
NULL
,
NULL
);
CL_CHECK_ERRORS
(
status
);
// bias->PrintTensor(*bias);
}
else
{
DLOG
<<
"error:bias dims is error"
;
PADDLE_MOBILE_ENFORCE
(
false
,
"element mul not support this situation yet"
)
}
}
...
...
mobile/src/operators/op_param.h
浏览文件 @
6554854a
...
...
@@ -489,6 +489,7 @@ class ConvParam : public OpParam {
EXEC_SLIDINGWINDOW5x5_FLOAT
,
EXEC_SLIDINGWINDOW7x7_FLOAT
,
EXEC_GEMM1x1s1_FLOAT
,
EXEC_DEPTHWISEBASIC_FLOAT
,
};
ExecMode
&
ExecMode
()
const
{
return
exec_mode_
;
}
...
...
mobile/test/net/test_net_multi_feed.cpp
浏览文件 @
6554854a
...
...
@@ -216,4 +216,6 @@ void test(int argc, char *argv[]) {
std
::
cout
<<
std
::
endl
;
}
}
#else
int
main
()
{}
#endif
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录